From c2405d1da64c6e4b74527f6352970b765dedd72d Mon Sep 17 00:00:00 2001 From: Pierre Marijon Date: Tue, 19 Mar 2024 16:45:08 +0100 Subject: [PATCH 1/8] chore: update pypublish github action --- .github/workflows/pypublish.yml | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pypublish.yml b/.github/workflows/pypublish.yml index 8c7bd5e..2bd286a 100644 --- a/.github/workflows/pypublish.yml +++ b/.github/workflows/pypublish.yml @@ -11,8 +11,6 @@ on: - main tags: - '*' - pull_request: - workflow_dispatch: permissions: contents: read @@ -24,8 +22,8 @@ jobs: matrix: target: [x86_64, x86, aarch64, armv7, s390x, ppc64le] steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.10' - name: Build wheels @@ -37,7 +35,7 @@ jobs: working-directory: vcf2parquet-py manylinux: auto - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: wheels path: vcf2parquet-py/dist @@ -48,8 +46,8 @@ jobs: matrix: target: [x64, x86] steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.10' architecture: ${{ matrix.target }} @@ -61,7 +59,7 @@ jobs: sccache: 'true' working-directory: vcf2parquet-py - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: wheels path: vcf2parquet-py/dist @@ -72,8 +70,8 @@ jobs: matrix: target: [x86_64, aarch64] steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.10' - name: Build wheels @@ -84,7 +82,7 @@ jobs: sccache: 'true' working-directory: vcf2parquet-py - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: wheels path: vcf2parquet-py/dist From 
a238c72697b34745eb8e95c4d18dd8fd592ffcac Mon Sep 17 00:00:00 2001 From: Pierre Marijon Date: Tue, 19 Mar 2024 17:35:02 +0100 Subject: [PATCH 2/8] =?UTF-8?q?chore:=C2=A0remove=20workspace?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/main.yaml | 2 +- .github/workflows/pypublish.yml | 12 +- Cargo.lock | 419 +----------------- Cargo.toml | 36 +- {vcf2parquet-py => python}/Cargo.toml | 0 {vcf2parquet-py => python}/Readme.md | 0 {vcf2parquet-py => python}/pyproject.toml | 0 {vcf2parquet-py => python}/src/error.rs | 0 {vcf2parquet-py => python}/src/lib.rs | 0 .../tests/test_vcf2parquet.py | 0 {vcf2parquet-bin/src => src}/cli.rs | 19 + {vcf2parquet-lib/src => src}/error.rs | 4 + src/lib.rs | 330 +++++++++++++- src/main.rs | 57 ++- {vcf2parquet-lib/src => src}/name2data.rs | 0 {vcf2parquet-lib/src => src}/record2chunk.rs | 0 {vcf2parquet-lib/src => src}/schema.rs | 0 tests/functional.rs | 27 +- vcf2parquet-bin/Cargo.toml | 18 - vcf2parquet-bin/src/error.rs | 80 ---- vcf2parquet-bin/src/lib.rs | 64 --- vcf2parquet-lib/Cargo.toml | 21 - vcf2parquet-lib/src/lib.rs | 327 -------------- 23 files changed, 463 insertions(+), 953 deletions(-) rename {vcf2parquet-py => python}/Cargo.toml (100%) rename {vcf2parquet-py => python}/Readme.md (100%) rename {vcf2parquet-py => python}/pyproject.toml (100%) rename {vcf2parquet-py => python}/src/error.rs (100%) rename {vcf2parquet-py => python}/src/lib.rs (100%) rename {vcf2parquet-py => python}/tests/test_vcf2parquet.py (100%) rename {vcf2parquet-bin/src => src}/cli.rs (95%) rename {vcf2parquet-lib/src => src}/error.rs (89%) rename {vcf2parquet-lib/src => src}/name2data.rs (100%) rename {vcf2parquet-lib/src => src}/record2chunk.rs (100%) rename {vcf2parquet-lib/src => src}/schema.rs (100%) delete mode 100644 vcf2parquet-bin/Cargo.toml delete mode 100644 vcf2parquet-bin/src/error.rs delete mode 100644 vcf2parquet-bin/src/lib.rs delete mode 100644 vcf2parquet-lib/Cargo.toml 
delete mode 100644 vcf2parquet-lib/src/lib.rs diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 8f982f7..f95d5d2 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -70,7 +70,7 @@ jobs: uses: actions/checkout@v2 - name: Generate code coverage - run: cargo +nightly tarpaulin --verbose --engine llvm --no-dead-code --all-features --workspace --timeout 120 --out xml + run: cargo +nightly tarpaulin --all-features --workspace --timeout 120 --out xml - name: Upload to codecov.io uses: codecov/codecov-action@v2 diff --git a/.github/workflows/pypublish.yml b/.github/workflows/pypublish.yml index 2bd286a..7f64d70 100644 --- a/.github/workflows/pypublish.yml +++ b/.github/workflows/pypublish.yml @@ -32,13 +32,13 @@ jobs: target: ${{ matrix.target }} args: --sdist --release --out dist --find-interpreter sccache: 'true' - working-directory: vcf2parquet-py + working-directory: python manylinux: auto - name: Upload wheels uses: actions/upload-artifact@v4 with: name: wheels - path: vcf2parquet-py/dist + path: python/dist windows: runs-on: windows-latest @@ -57,12 +57,12 @@ jobs: target: ${{ matrix.target }} args: --sdist --release --out dist --find-interpreter sccache: 'true' - working-directory: vcf2parquet-py + working-directory: python - name: Upload wheels uses: actions/upload-artifact@v4 with: name: wheels - path: vcf2parquet-py/dist + path: python/dist macos: runs-on: macos-latest @@ -80,12 +80,12 @@ jobs: target: ${{ matrix.target }} args: --sdist --release --out dist --find-interpreter sccache: 'true' - working-directory: vcf2parquet-py + working-directory: python - name: Upload wheels uses: actions/upload-artifact@v4 with: name: wheels - path: vcf2parquet-py/dist + path: python/dist release: name: Release diff --git a/Cargo.lock b/Cargo.lock index ca55af0..874476d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -100,32 +100,6 @@ dependencies = [ "serde", ] -[[package]] -name = "arrow2" -version = "0.17.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "59c468daea140b747d781a1da9f7db5f0a8e6636d4af20cc539e43d05b0604fa" -dependencies = [ - "ahash", - "arrow-format", - "base64", - "bytemuck", - "chrono", - "dyn-clone", - "either", - "ethnum", - "fallible-streaming-iterator", - "foreign_vec", - "futures", - "getrandom", - "hash_hasher", - "num-traits", - "parquet2", - "rustc_version", - "simdutf8", - "streaming-iterator", -] - [[package]] name = "arrow2" version = "0.18.0" @@ -229,12 +203,6 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.5.0" @@ -385,7 +353,7 @@ version = "4.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90239a040c80f5e14809ca132ddc4176ab33d5e17e49691793296e3fcb34d72f" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn", @@ -446,15 +414,6 @@ version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" -[[package]] -name = "deranged" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" -dependencies = [ - "powerfmt", -] - [[package]] name = "difflib" version = "0.4.0" @@ -646,12 +605,6 @@ dependencies = [ "ahash", ] -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" @@ -668,18 +621,6 @@ dependencies = [ "hashbrown", ] -[[package]] -name 
= "indoc" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" - -[[package]] -name = "itoa" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" - [[package]] name = "jobserver" version = "0.1.28" @@ -716,16 +657,6 @@ version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" -[[package]] -name = "lock_api" -version = "0.4.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" -dependencies = [ - "autocfg", - "scopeguard", -] - [[package]] name = "log" version = "0.4.21" @@ -769,15 +700,6 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" -[[package]] -name = "memoffset" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] - [[package]] name = "miniz_oxide" version = "0.7.2" @@ -871,12 +793,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "num-conv" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" - [[package]] name = "num-traits" version = "0.2.18" @@ -886,44 +802,12 @@ dependencies = [ "autocfg", ] -[[package]] -name = "num_threads" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" -dependencies = [ - "libc", -] - [[package]] name = "once_cell" version = 
"1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" -[[package]] -name = "parking_lot" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-targets 0.48.5", -] - [[package]] name = "parquet-format-safe" version = "0.2.4" @@ -985,18 +869,6 @@ dependencies = [ "array-init-cursor", ] -[[package]] -name = "portable-atomic" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" - -[[package]] -name = "powerfmt" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" - [[package]] name = "predicates" version = "3.1.0" @@ -1033,69 +905,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "pyo3" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53bdbb96d49157e65d45cc287af5f32ffadd5f4761438b527b055fb0d4bb8233" -dependencies = [ - "cfg-if", - "indoc", - "libc", - "memoffset", - "parking_lot", - "portable-atomic", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-build-config" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deaa5745de3f5231ce10517a1f5dd97d53e5a2fd77aa6b5842292085831d48d7" -dependencies = [ - "once_cell", - "target-lexicon", -] - -[[package]] 
-name = "pyo3-ffi" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b42531d03e08d4ef1f6e85a2ed422eb678b8cd62b762e53891c05faf0d4afa" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7305c720fa01b8055ec95e484a6eca7a83c841267f0dd5280f0c8b8551d2c158" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c7e9b68bb9c3149c5b0cade5d07f953d6d125eb4337723c4ccdb665f1f96185" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "pyo3-build-config", - "quote", - "syn", -] - [[package]] name = "quote" version = "1.0.35" @@ -1125,15 +934,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "redox_syscall" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "regex-automata" version = "0.4.6" @@ -1161,19 +961,13 @@ version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 2.5.0", + "bitflags", "errno", "libc", "linux-raw-sys", "windows-sys", ] -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - [[package]] name = "semver" version = "1.0.22" @@ -1212,17 +1006,6 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" -[[package]] -name = "simplelog" -version = 
"0.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16257adbfaef1ee58b1363bdc0664c9b8e1e30aed86049635fb5f147d065a9c0" -dependencies = [ - "log", - "termcolor", - "time", -] - [[package]] name = "slab" version = "0.4.9" @@ -1232,12 +1015,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "smallvec" -version = "1.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" - [[package]] name = "snap" version = "1.1.1" @@ -1276,12 +1053,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "target-lexicon" -version = "0.12.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" - [[package]] name = "tempfile" version = "3.10.1" @@ -1294,15 +1065,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "termcolor" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" -dependencies = [ - "winapi-util", -] - [[package]] name = "termtree" version = "0.4.1" @@ -1329,51 +1091,12 @@ dependencies = [ "syn", ] -[[package]] -name = "time" -version = "0.3.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" -dependencies = [ - "deranged", - "itoa", - "libc", - "num-conv", - "num_threads", - "powerfmt", - "serde", - "time-core", - "time-macros", -] - -[[package]] -name = "time-core" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" - -[[package]] -name = "time-macros" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774" 
-dependencies = [ - "num-conv", - "time-core", -] - [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" -[[package]] -name = "unindent" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" - [[package]] name = "utf8parse" version = "0.2.1" @@ -1384,29 +1107,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" name = "vcf2parquet" version = "0.6.0" dependencies = [ + "arrow2", "assert_cmd", - "tempfile", - "vcf2parquet-bin", - "vcf2parquet-lib", -] - -[[package]] -name = "vcf2parquet-bin" -version = "0.6.0" -dependencies = [ - "arrow2 0.18.0", "clap", - "niffler", - "simplelog", - "thiserror", - "vcf2parquet-lib", -] - -[[package]] -name = "vcf2parquet-lib" -version = "0.6.0" -dependencies = [ - "arrow2 0.18.0", "lazy_static", "log", "niffler", @@ -1417,18 +1120,6 @@ dependencies = [ "thiserror", ] -[[package]] -name = "vcf2parquet-py" -version = "0.6.0" -dependencies = [ - "arrow2 0.17.4", - "niffler", - "pyo3", - "tempfile", - "thiserror", - "vcf2parquet-lib", -] - [[package]] name = "version_check" version = "0.9.4" @@ -1504,59 +1195,13 @@ version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" -dependencies = [ - "winapi", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", + "windows-targets", ] [[package]] @@ -1565,93 +1210,51 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - [[package]] name = "windows_aarch64_msvc" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - [[package]] name = "windows_i686_gnu" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - [[package]] name = "windows_i686_msvc" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - [[package]] name = "windows_x86_64_gnu" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - [[package]] name = "windows_x86_64_msvc" version = "0.52.4" diff --git a/Cargo.toml b/Cargo.toml index f6de8ad..4b63f00 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,32 +13,30 @@ keywords = ["bioinformatics", "parquet"] [dependencies] -vcf2parquet-lib = { path = "vcf2parquet-lib", version = "0.6.0", optional = true } -vcf2parquet-bin = { path = "vcf2parquet-bin", version = "0.6.0", optional = true } +# parallel +rayon = { version = "1" } -[dev-dependencies] -tempfile = { version = "3" } -assert_cmd = { version = "2" } - - -[workspace] -members = ["vcf2parquet-lib", "vcf2parquet-bin", "vcf2parquet-py"] +# input output management +niffler = { version = "2" } +noodles = { version = "0.64", features = ["vcf"] } +arrow2 = { version = "0.18", features = ["io_parquet", "io_parquet_compression"] } +rustc-hash = { version = "1" } +# logging management +log = { version = "0.4" } -[features] -default = ["lib"] +# error management +thiserror = { version = "1" } -lib = ["vcf2parquet-lib"] -bin = ["vcf2parquet-bin"] +# cli management +clap = { version = "4", features = ["derive"] } -[[bin]] -name = "vcf2parquet" -required-features = ["bin"] +[dev-dependencies] +lazy_static = { version = "1" } +tempfile = { version = "3" } +assert_cmd = { version = "2" } -[[test]] -name = "functional" -required-features = ["bin"] [package.metadata.docs.rs] all-features = true diff --git a/vcf2parquet-py/Cargo.toml b/python/Cargo.toml similarity index 
100% rename from vcf2parquet-py/Cargo.toml rename to python/Cargo.toml diff --git a/vcf2parquet-py/Readme.md b/python/Readme.md similarity index 100% rename from vcf2parquet-py/Readme.md rename to python/Readme.md diff --git a/vcf2parquet-py/pyproject.toml b/python/pyproject.toml similarity index 100% rename from vcf2parquet-py/pyproject.toml rename to python/pyproject.toml diff --git a/vcf2parquet-py/src/error.rs b/python/src/error.rs similarity index 100% rename from vcf2parquet-py/src/error.rs rename to python/src/error.rs diff --git a/vcf2parquet-py/src/lib.rs b/python/src/lib.rs similarity index 100% rename from vcf2parquet-py/src/lib.rs rename to python/src/lib.rs diff --git a/vcf2parquet-py/tests/test_vcf2parquet.py b/python/tests/test_vcf2parquet.py similarity index 100% rename from vcf2parquet-py/tests/test_vcf2parquet.py rename to python/tests/test_vcf2parquet.py diff --git a/vcf2parquet-bin/src/cli.rs b/src/cli.rs similarity index 95% rename from vcf2parquet-bin/src/cli.rs rename to src/cli.rs index bbb15e2..db4e958 100644 --- a/vcf2parquet-bin/src/cli.rs +++ b/src/cli.rs @@ -6,17 +6,32 @@ /* project use */ +/// Compression available for user #[derive(Debug, clap::ValueEnum, Clone, Copy)] pub enum Compression { + /// No compression Uncompressed, + + /// Snappy compression Snappy, + + /// Gzip compression Gzip, + + /// Lzo compression Lzo, + + /// Brotly compression Brotli, + + /// Lz4 compression Lz4, + + /// Zstd compression Zstd, } +/// Define cli of vcf2parquet #[derive(clap::Parser, std::fmt::Debug)] #[command( name = "vcf2parquet", @@ -49,9 +64,13 @@ pub struct Command { subcommand: SubCommand, } +/// Enum to manage sub command #[derive(clap::Parser, std::fmt::Debug, Clone)] pub enum SubCommand { + /// Convert a vcf in a parquet Convert(Convert), + + /// Convert a vcf in multiple parquet file each file contains `batch_size` record Split(Split), } diff --git a/vcf2parquet-lib/src/error.rs b/src/error.rs similarity index 89% rename from 
vcf2parquet-lib/src/error.rs rename to src/error.rs index 9182b43..dde4087 100644 --- a/vcf2parquet-lib/src/error.rs +++ b/src/error.rs @@ -28,6 +28,10 @@ pub enum Error { /// Noodles header vcf error #[error(transparent)] NoodlesHeader(#[from] noodles::vcf::header::ParseError), + + /// Niffler error + #[error(transparent)] + Niffler(#[from] niffler::Error), } pub type Result = std::result::Result; diff --git a/src/lib.rs b/src/lib.rs index c1ddeb6..98e38c9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,329 @@ -#![warn(missing_docs)] +//! vcf2parquet library -//! vcf2parquet allow user to convert a vcf in parquet format. +#[warn(missing_docs)] +/* std use */ -pub use vcf2parquet_lib::*; +/* crate use */ + +/* project use */ + +/* mod section */ +pub mod cli; +pub mod error; +pub mod name2data; +pub mod record2chunk; +pub mod schema; + +/// Read `input` vcf and write parquet in `output` +pub fn vcf2parquet( + input: &mut R, + output: &mut W, + batch_size: usize, + compression: arrow2::io::parquet::write::CompressionOptions, + info_optional: bool, +) -> error::Result<()> +where + R: std::io::BufRead, + W: std::io::Write, +{ + // VCF section + let mut reader = noodles::vcf::Reader::new(input); + + let vcf_header: noodles::vcf::Header = reader.read_header()?; + + // Parquet section + let schema = schema::from_header(&vcf_header, info_optional)?; + + let mut iterator = reader.records(&vcf_header); + let chunk_iterator = record2chunk::Record2Chunk::new( + &mut iterator, + batch_size, + vcf_header.clone(), + schema.clone(), + ); + + let options = arrow2::io::parquet::write::WriteOptions { + write_statistics: true, + compression, + version: arrow2::io::parquet::write::Version::V2, + data_pagesize_limit: Some(batch_size), + }; + + let encodings = chunk_iterator.encodings(); + let row_groups = arrow2::io::parquet::write::RowGroupIterator::try_new( + chunk_iterator, + &schema, + options, + encodings, + )?; + + let mut writer = 
arrow2::io::parquet::write::FileWriter::try_new(output, schema, options)?; + + for group in row_groups { + writer.write(group?)?; + } + let _ = writer.end(None)?; + + Ok(()) +} + +/// Read `input` vcf and write each row group in a parquet file match with template +pub fn vcf2multiparquet( + input: &mut R, + template: &str, + batch_size: usize, + compression: arrow2::io::parquet::write::CompressionOptions, + info_optional: bool, +) -> error::Result<()> +where + R: std::io::BufRead, +{ + // VCF section + let mut reader = noodles::vcf::Reader::new(input); + + let vcf_header: noodles::vcf::Header = reader.read_header()?; + + // Parquet section + let schema = schema::from_header(&vcf_header, info_optional)?; + + let mut iterator = reader.records(&vcf_header); + let chunk_iterator = record2chunk::Record2Chunk::new( + &mut iterator, + batch_size, + vcf_header.clone(), + schema.clone(), + ); + + let options = arrow2::io::parquet::write::WriteOptions { + write_statistics: true, + compression, + version: arrow2::io::parquet::write::Version::V2, + data_pagesize_limit: Some(batch_size), + }; + + let encodings = chunk_iterator.encodings(); + let row_groups = arrow2::io::parquet::write::RowGroupIterator::try_new( + chunk_iterator, + &schema, + options, + encodings, + )?; + + for (index, group) in row_groups.enumerate() { + let output = std::fs::File::create(template.replace("{}", &index.to_string()))?; + let mut writer = + arrow2::io::parquet::write::FileWriter::try_new(output, schema.clone(), options)?; + + writer.write(group?)?; + writer.end(None)?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + static VCF_FILE: &[u8] = b"##fileformat=VCFv4.3 +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO +1\t925952\t1019397\tG\tA\t.\t.\t. 
+"; + + static PARQUET_FILE: &[u8] = &[ + 80, 65, 82, 49, 21, 6, 21, 10, 21, 50, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, + 28, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, + 96, 48, 4, 0, 151, 222, 156, 170, 5, 0, 0, 0, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, + 114, 111, 109, 111, 115, 111, 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, + 40, 1, 49, 24, 1, 49, 0, 0, 21, 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, + 21, 0, 17, 28, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 0, 31, 139, 8, 0, 0, + 0, 0, 0, 0, 255, 99, 80, 228, 99, 0, 0, 69, 222, 72, 134, 4, 0, 0, 0, 21, 2, 25, 37, 0, 6, + 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, + 1, 60, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 21, 6, 21, 30, 21, 70, 92, + 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, + 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 0, 3, 0, 3, 1, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, + 99, 103, 96, 96, 48, 52, 48, 180, 52, 182, 52, 7, 0, 69, 88, 164, 201, 11, 0, 0, 0, 21, 12, + 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 4, 108, 105, + 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, 160, 3, 60, 54, 0, 40, 7, 49, + 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 21, 6, 21, 10, 21, 50, 92, + 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 0, 31, + 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 7, 0, 158, 10, 250, 19, 5, 0, 0, 0, + 21, 12, 25, 37, 0, 6, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, + 22, 74, 22, 114, 38, 186, 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 21, 6, 21, 10, 21, 50, + 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 0, + 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 4, 0, 171, 175, 153, 250, 5, 0, 0, + 0, 21, 12, 
25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 4, 22, + 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 21, 6, 21, 4, 21, + 44, 92, 21, 2, 21, 2, 21, 2, 21, 0, 21, 4, 21, 0, 17, 28, 54, 2, 0, 0, 0, 3, 0, 31, 139, 8, + 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, + 117, 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 21, + 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 0, 0, 0, 3, + 0, 3, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 12, 25, 37, + 0, 6, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, + 116, 101, 114, 21, 4, 22, 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 21, 12, 25, 5, + 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 0, 22, 0, 22, 0, 22, 0, + 38, 0, 0, 21, 2, 25, 5, 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, + 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, + 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, + 25, 5, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, + 38, 0, 0, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 0, 22, + 0, 22, 0, 22, 0, 38, 0, 0, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, + 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, + 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, + 0, 25, 17, 2, 25, 24, 1, 49, 25, 24, 1, 49, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 4, 0, + 33, 14, 0, 25, 24, 4, 0, 33, 14, 0, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 7, 49, 48, 49, + 57, 51, 57, 55, 25, 24, 7, 49, 48, 49, 57, 51, 57, 55, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, + 24, 1, 71, 25, 24, 1, 71, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 65, 
25, 24, 1, 65, 21, + 0, 25, 22, 0, 0, 25, 17, 1, 25, 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 2, 0, 25, 17, 1, 25, + 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 0, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, + 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, + 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, + 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 28, 22, 8, 21, 114, 22, 0, 0, 0, 25, 28, 22, + 202, 1, 21, 124, 22, 0, 0, 0, 25, 28, 22, 160, 3, 21, 158, 1, 22, 0, 0, 0, 25, 28, 22, 186, + 5, 21, 114, 22, 0, 0, 0, 25, 28, 22, 252, 6, 21, 114, 22, 0, 0, 0, 25, 28, 22, 190, 8, 21, + 96, 22, 0, 0, 0, 25, 28, 22, 222, 9, 21, 100, 22, 0, 0, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, + 25, 12, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, 21, 4, 25, 204, 72, 4, 114, 111, 111, 116, 21, + 14, 0, 21, 12, 37, 0, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 37, 0, 76, + 28, 0, 0, 0, 21, 2, 37, 0, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 0, 53, 0, 24, 10, + 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, + 4, 108, 105, 115, 116, 21, 2, 0, 21, 12, 37, 0, 24, 2, 105, 100, 37, 0, 76, 28, 0, 0, 0, + 21, 12, 37, 0, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 37, 0, 76, 28, 0, 0, 0, + 21, 12, 37, 0, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 37, 0, 76, 28, 0, 0, 0, + 21, 8, 37, 2, 24, 7, 113, 117, 97, 108, 105, 116, 121, 0, 53, 0, 24, 6, 102, 105, 108, 116, + 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, 4, 108, 105, 115, 116, 21, 2, 0, 21, + 12, 37, 0, 24, 6, 102, 105, 108, 116, 101, 114, 37, 0, 76, 28, 0, 0, 0, 22, 2, 25, 44, 25, + 124, 38, 122, 28, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, + 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 22, + 222, 17, 21, 20, 22, 178, 14, 21, 34, 0, 38, 198, 2, 28, 21, 2, 25, 37, 0, 6, 25, 24, 8, + 112, 111, 115, 105, 116, 105, 
111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, 1, 60, 54, + 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 22, 242, 17, 21, 22, 22, 212, 14, 21, + 46, 0, 38, 190, 4, 28, 21, 12, 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, + 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, + 160, 3, 60, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, + 0, 22, 136, 18, 21, 24, 22, 130, 15, 21, 58, 0, 38, 172, 6, 28, 21, 12, 25, 37, 0, 6, 25, + 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 186, + 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 22, 160, 18, 21, 22, 22, 188, 15, 21, 34, 0, 38, + 238, 7, 28, 21, 12, 25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, + 4, 22, 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 22, 182, 18, + 21, 22, 22, 222, 15, 21, 34, 0, 38, 158, 9, 28, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, 117, + 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 22, 204, + 18, 21, 22, 22, 128, 16, 21, 34, 0, 38, 194, 10, 28, 21, 12, 25, 37, 0, 6, 25, 56, 6, 102, + 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 4, 22, + 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 22, 226, 18, 21, 22, 22, 162, 16, 21, 34, + 0, 22, 156, 4, 22, 2, 38, 8, 22, 180, 6, 20, 0, 0, 25, 124, 38, 0, 28, 21, 12, 25, 5, 25, + 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, + 0, 22, 248, 18, 21, 6, 22, 196, 16, 21, 22, 0, 38, 0, 28, 21, 2, 25, 5, 25, 24, 8, 112, + 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 254, 18, 21, + 6, 22, 218, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, + 105, 102, 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, + 38, 0, 0, 22, 132, 19, 21, 6, 22, 240, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 
24, 9, + 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 138, + 19, 21, 6, 22, 134, 17, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, + 114, 110, 97, 116, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 144, 19, 21, 6, 22, 156, + 17, 21, 22, 0, 38, 0, 28, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, 0, + 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 150, 19, 21, 6, 22, 178, 17, 21, 22, 0, 38, 0, 28, 21, + 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, + 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 156, 19, 21, 6, 22, 200, 17, + 21, 22, 0, 22, 0, 22, 0, 38, 0, 22, 0, 20, 2, 0, 25, 28, 24, 12, 65, 82, 82, 79, 87, 58, + 115, 99, 104, 101, 109, 97, 24, 244, 6, 47, 47, 47, 47, 47, 52, 56, 67, 65, 65, 65, 69, 65, + 65, 65, 65, 56, 118, 47, 47, 47, 120, 81, 65, 65, 65, 65, 69, 65, 65, 69, 65, 65, 65, 65, + 75, 65, 65, 115, 65, 67, 65, 65, 75, 65, 65, 81, 65, 43, 80, 47, 47, 47, 119, 119, 65, 65, + 65, 65, 73, 65, 65, 103, 65, 65, 65, 65, 69, 65, 65, 99, 65, 65, 65, 65, 103, 65, 103, 65, + 65, 48, 65, 69, 65, 65, 70, 65, 66, 65, 65, 65, 77, 65, 81, 65, 65, 121, 65, 65, 65, 65, + 73, 81, 65, 65, 65, 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 50, 119, 65, 65, 65, 66, + 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, + 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 81, 65, 65, 65, 65, 81, + 65, 65, 65, 68, 115, 47, 47, 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, + 65, 65, 65, 66, 81, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, + 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, + 65, 71, 65, 65, 65, 65, 90, 109, 108, 115, 100, 71, 86, 121, 65, 65, 68, 56, 47, 47, 47, + 47, 66, 65, 65, 69, 65, 65, 89, 65, 65, 65, 66, 109, 97, 87, 120, 48, 90, 88, 73, 65, 65, + 79, 122, 47, 47, 47, 56, 119, 65, 65, 65, 65, 73, 65, 
65, 65, 65, 66, 103, 65, 65, 65, 65, + 66, 65, 119, 65, 65, 69, 65, 65, 83, 65, 65, 81, 65, 69, 65, 65, 82, 65, 65, 103, 65, 65, + 65, 65, 77, 65, 65, 65, 65, 65, 65, 68, 54, 47, 47, 47, 47, 65, 81, 65, 71, 65, 65, 89, 65, + 66, 65, 65, 72, 65, 65, 65, 65, 99, 88, 86, 104, 98, 71, 108, 48, 101, 81, 68, 115, 47, 47, + 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, 65, 65, 66, 81, 65, 65, 65, + 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, 65, 65, 65, 65, 68, 65, 65, 65, + 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, 65, 74, 65, 65, 65, 65, 89, 87, + 120, 48, 90, 88, 74, 117, 89, 88, 82, 108, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, + 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, + 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, + 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 81, 65, 65, 65, 72, 74, 108, 90, + 109, 86, 121, 90, 87, 53, 106, 90, 81, 65, 65, 65, 79, 122, 47, 47, 47, 57, 111, 65, 65, + 65, 65, 88, 65, 65, 65, 65, 66, 103, 65, 65, 65, 65, 77, 65, 65, 65, 65, 69, 65, 65, 82, + 65, 65, 81, 65, 65, 65, 65, 81, 65, 65, 103, 65, 65, 65, 65, 77, 65, 65, 69, 65, 65, 65, + 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, 65, 65, 65, 65, 103, 65, 65, 65, 65, + 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, + 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, + 65, 81, 65, 65, 103, 65, 65, 65, 71, 108, 107, 65, 65, 68, 56, 47, 47, 47, 47, 66, 65, 65, + 69, 65, 65, 111, 65, 65, 65, 66, 112, 90, 71, 86, 117, 100, 71, 108, 109, 97, 87, 86, 121, + 65, 65, 68, 115, 47, 47, 47, 47, 79, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, + 65, 65, 65, 103, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, + 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 57, 80, 47, 47, 47, 121, 65, 65, 65, 65, + 65, 66, 65, 65, 65, 65, 
67, 65, 65, 74, 65, 65, 81, 65, 67, 65, 65, 73, 65, 65, 65, 65, 99, + 71, 57, 122, 97, 88, 82, 112, 98, 50, 52, 65, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, + 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, + 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, + 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 103, 65, 65, 65, 71, 78, 111, 99, + 109, 57, 116, 98, 51, 78, 118, 98, 87, 85, 65, 0, 24, 44, 65, 114, 114, 111, 119, 50, 32, + 45, 32, 78, 97, 116, 105, 118, 101, 32, 82, 117, 115, 116, 32, 105, 109, 112, 108, 101, + 109, 101, 110, 116, 97, 116, 105, 111, 110, 32, 111, 102, 32, 65, 114, 114, 111, 119, 0, + 107, 7, 0, 0, 80, 65, 82, 49, + ]; + + #[test] + fn convert_positives() { + let mut input = std::io::BufReader::new(&*VCF_FILE); + let mut output = Vec::new(); + + vcf2parquet( + &mut input, + &mut output, + 1, + arrow2::io::parquet::write::CompressionOptions::Gzip(None), + false, + ) + .unwrap(); + assert_eq!(output, *PARQUET_FILE); + } + + #[test] + fn not_a_vcf() { + let raw_data = [b'#', b'a', b'b', b'c', 255, 0x7F, b'\n'].to_vec(); + let mut input = std::io::BufReader::new(&raw_data[..]); + let mut output = Vec::new(); + + let result = vcf2parquet( + &mut input, + &mut output, + 1, + arrow2::io::parquet::write::CompressionOptions::Gzip(None), + false, + ); + + assert!(result.is_err()); + } + + #[test] + fn multi_positives() { + let mut input = std::io::BufReader::new(&*VCF_FILE); + let dir = tempfile::tempdir().unwrap(); + + let format = dir + .path() + .join("test_{}.parquet") + .as_os_str() + .to_str() + .unwrap() + .to_string(); + + vcf2multiparquet( + &mut input, + &format, + 1, + arrow2::io::parquet::write::CompressionOptions::Gzip(None), + false, + ) + .unwrap(); + } +} diff --git a/src/main.rs b/src/main.rs index 5d6b6e8..efe6894 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,57 @@ -#[cfg(feature = "bin")] -pub use vcf2parquet_bin::{error, main as 
bin_main}; +//! vcf2parquet bin + +/* std use */ + +/* crate use */ +use clap::Parser as _; + +/* project use */ +use vcf2parquet::cli; +use vcf2parquet::error; + +/* mod section */ pub fn main() -> error::Result<()> { - bin_main() + let params = cli::Command::parse(); + + match params.subcommand() { + cli::SubCommand::Convert(subparams) => convert(&params, subparams), + cli::SubCommand::Split(subparams) => split(&params, subparams), + } +} + +fn convert(params: &cli::Command, subparams: &cli::Convert) -> error::Result<()> { + let mut reader = std::fs::File::open(params.input()) + .map(Box::new) + .map(|x| niffler::get_reader(x))? + .map(|(file, _)| std::io::BufReader::with_capacity(params.read_buffer(), file))?; + + let mut output = std::fs::File::create(subparams.output())?; + + vcf2parquet::vcf2parquet( + &mut reader, + &mut output, + params.batch_size(), + params.compression(), + params.info_optional(), + )?; + + Ok(()) +} + +fn split(params: &cli::Command, subparams: &cli::Split) -> error::Result<()> { + let mut reader = std::fs::File::open(params.input()) + .map(Box::new) + .map(|x| niffler::get_reader(x))? 
+ .map(|(file, _)| std::io::BufReader::with_capacity(params.read_buffer(), file))?; + + vcf2parquet::vcf2multiparquet( + &mut reader, + subparams.format(), + params.batch_size(), + params.compression(), + params.info_optional(), + )?; + + Ok(()) } diff --git a/vcf2parquet-lib/src/name2data.rs b/src/name2data.rs similarity index 100% rename from vcf2parquet-lib/src/name2data.rs rename to src/name2data.rs diff --git a/vcf2parquet-lib/src/record2chunk.rs b/src/record2chunk.rs similarity index 100% rename from vcf2parquet-lib/src/record2chunk.rs rename to src/record2chunk.rs diff --git a/vcf2parquet-lib/src/schema.rs b/src/schema.rs similarity index 100% rename from vcf2parquet-lib/src/schema.rs rename to src/schema.rs diff --git a/tests/functional.rs b/tests/functional.rs index 1f0024f..6114244 100644 --- a/tests/functional.rs +++ b/tests/functional.rs @@ -14,7 +14,27 @@ fn help() -> Result<(), assert_cmd::cargo::CargoError> { cmd.args(["-h"]); - let truth: &[u8] = b"Convert a vcf in parquet + let truth: &[u8] = if cfg!(windows) { + b"Convert a vcf in parquet + +Usage: vcf2parquet.exe [OPTIONS] --input + +Commands: + convert Convert a vcf in a parquet + split Convert a vcf in multiple parquet file each file contains `batch_size` record + help Print this message or the help of the given subcommand(s) + +Options: + -i, --input Input path + -b, --batch-size Batch size (default 100,000) + -c, --compression Compression method (default snappy) [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd] + -r, --read-buffer Read buffer size in bytes (default 8192) + -I, --info-optional All information fields are optional + -h, --help Print help (see more with '--help') + -V, --version Print version +" + } else { + b"Convert a vcf in parquet Usage: vcf2parquet [OPTIONS] --input @@ -29,9 +49,10 @@ Options: -c, --compression Compression method (default snappy) [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd] -r, --read-buffer Read buffer size in 
bytes (default 8192) -I, --info-optional All information fields are optional - -h, --help Print help + -h, --help Print help (see more with '--help') -V, --version Print version -"; +" + }; let assert = cmd.assert(); diff --git a/vcf2parquet-bin/Cargo.toml b/vcf2parquet-bin/Cargo.toml deleted file mode 100644 index f9f92ec..0000000 --- a/vcf2parquet-bin/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "vcf2parquet-bin" -version = "0.6.0" -edition = "2021" - -[dependencies] -vcf2parquet-lib = { path = "../vcf2parquet-lib", version = "0.6.0" } -niffler = { version = "2" } -arrow2 = { version = "0.18", features = ["io_parquet", "io_parquet_compression"] } - -# logging management -simplelog = { version = "0.12" } - -# error management -thiserror = { version = "1" } - -# cli management -clap = { version = "4", features = ["derive"] } diff --git a/vcf2parquet-bin/src/error.rs b/vcf2parquet-bin/src/error.rs deleted file mode 100644 index 921c1ae..0000000 --- a/vcf2parquet-bin/src/error.rs +++ /dev/null @@ -1,80 +0,0 @@ -//! 
error of vcf2parquet-bin - -/* std use */ - -/* crate use */ - -/* project use */ - -#[derive(thiserror::Error, std::fmt::Debug)] -pub enum Error { - /// Io error - #[error(transparent)] - Io { error: std::io::Error }, - - /// Niffler error - #[error(transparent)] - Niffler { error: niffler::Error }, - - /// vcf2parquet-lib error - #[error(transparent)] - Lib { - error: vcf2parquet_lib::error::Error, - }, -} - -pub fn mapping<E>(error: E) -> Error -where - E: std::convert::Into<Error>, -{ - error.into() -} - -impl From<std::io::Error> for Error { - fn from(error: std::io::Error) -> Self { - Error::Io { error } - } -} - -impl From<niffler::Error> for Error { - fn from(error: niffler::Error) -> Self { - Error::Niffler { error } - } -} - -impl From<vcf2parquet_lib::error::Error> for Error { - fn from(error: vcf2parquet_lib::error::Error) -> Self { - Error::Lib { error } - } -} - -pub type Result<T> = std::result::Result<T, Error>; - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn error_conversion() { - assert_eq!( - format!( - "{:?}", - Error::from(std::io::Error::new(std::io::ErrorKind::NotFound, "test")) - ), - "Io { error: Custom { kind: NotFound, error: \"test\" } }".to_string() - ); - - assert_eq!( - format!("{:?}", Error::from(niffler::Error::FileTooShort)), - "Niffler { error: FileTooShort }".to_string() - ); - - assert_eq!( - format!( - "{:?}", - Error::from(vcf2parquet_lib::error::Error::NoConversion) - ), - "Lib { error: NoConversion }".to_string() - ); - } -} diff --git a/vcf2parquet-bin/src/lib.rs b/vcf2parquet-bin/src/lib.rs deleted file mode 100644 index 8d49bf8..0000000 --- a/vcf2parquet-bin/src/lib.rs +++ /dev/null @@ -1,64 +0,0 @@ -//! 
vcf2parquet bin - -/* std use */ - -/* crate use */ -use clap::Parser as _; - -/* project use */ -use vcf2parquet_lib as lib; - -/* mod section */ -pub mod cli; -pub mod error; - -pub fn main() -> error::Result<()> { - let params = cli::Command::parse(); - - match params.subcommand() { - cli::SubCommand::Convert(subparams) => convert(&params, subparams), - cli::SubCommand::Split(subparams) => split(&params, subparams), - } -} - -fn convert(params: &cli::Command, subparams: &cli::Convert) -> error::Result<()> { - let mut reader = std::fs::File::open(params.input()) - .map_err(error::mapping) - .map(Box::new) - .map(|x| niffler::get_reader(x)) - .map_err(error::mapping)? - .map(|(file, _)| std::io::BufReader::with_capacity(params.read_buffer(), file))?; - - let mut output = std::fs::File::create(subparams.output()).map_err(error::mapping)?; - - lib::vcf2parquet( - &mut reader, - &mut output, - params.batch_size(), - params.compression(), - params.info_optional(), - ) - .map_err(error::mapping)?; - - Ok(()) -} - -fn split(params: &cli::Command, subparams: &cli::Split) -> error::Result<()> { - let mut reader = std::fs::File::open(params.input()) - .map_err(error::mapping) - .map(Box::new) - .map(|x| niffler::get_reader(x)) - .map_err(error::mapping)? 
- .map(|(file, _)| std::io::BufReader::with_capacity(params.read_buffer(), file))?; - - lib::vcf2multiparquet( - &mut reader, - subparams.format(), - params.batch_size(), - params.compression(), - params.info_optional(), - ) - .map_err(error::mapping)?; - - Ok(()) -} diff --git a/vcf2parquet-lib/Cargo.toml b/vcf2parquet-lib/Cargo.toml deleted file mode 100644 index 221788c..0000000 --- a/vcf2parquet-lib/Cargo.toml +++ /dev/null @@ -1,21 +0,0 @@ -[package] -name = "vcf2parquet-lib" -version = "0.6.0" -edition = "2021" - -[dependencies] -rayon = { version = "1" } - -# input output management -niffler = { version = "2" } -noodles = { version = "0.64", features = ["vcf"] } -arrow2 = { version = "0.18", features = ["io_parquet", "io_parquet_compression"] } -rustc-hash = { version = "1" } - -# logging and error management -log = { version = "0.4" } -thiserror = { version = "1" } - -[dev-dependencies] -lazy_static = { version = "1" } -tempfile = { version = "3" } \ No newline at end of file diff --git a/vcf2parquet-lib/src/lib.rs b/vcf2parquet-lib/src/lib.rs deleted file mode 100644 index e89c1f6..0000000 --- a/vcf2parquet-lib/src/lib.rs +++ /dev/null @@ -1,327 +0,0 @@ -//! 
vcf2parquet library - -/* std use */ - -/* crate use */ - -/* project use */ - -/* mod section */ -pub mod error; -pub mod name2data; -pub mod record2chunk; -pub mod schema; - -/// Read `input` vcf and write parquet in `output` -pub fn vcf2parquet<R, W>( - input: &mut R, - output: &mut W, - batch_size: usize, - compression: arrow2::io::parquet::write::CompressionOptions, - info_optional: bool, -) -> error::Result<()> -where - R: std::io::BufRead, - W: std::io::Write, -{ - // VCF section - let mut reader = noodles::vcf::Reader::new(input); - - let vcf_header: noodles::vcf::Header = reader.read_header()?; - - // Parquet section - let schema = schema::from_header(&vcf_header, info_optional)?; - - let mut iterator = reader.records(&vcf_header); - let chunk_iterator = record2chunk::Record2Chunk::new( - &mut iterator, - batch_size, - vcf_header.clone(), - schema.clone(), - ); - - let options = arrow2::io::parquet::write::WriteOptions { - write_statistics: true, - compression, - version: arrow2::io::parquet::write::Version::V2, - data_pagesize_limit: Some(batch_size), - }; - - let encodings = chunk_iterator.encodings(); - let row_groups = arrow2::io::parquet::write::RowGroupIterator::try_new( - chunk_iterator, - &schema, - options, - encodings, - )?; - - let mut writer = arrow2::io::parquet::write::FileWriter::try_new(output, schema, options)?; - - for group in row_groups { - writer.write(group?)?; - } - let _ = writer.end(None)?; - - Ok(()) -} - -/// Read `input` vcf and write each row group in a parquet file match with template -pub fn vcf2multiparquet<R>( - input: &mut R, - template: &str, - batch_size: usize, - compression: arrow2::io::parquet::write::CompressionOptions, - info_optional: bool, -) -> error::Result<()> -where - R: std::io::BufRead, -{ - // VCF section - let mut reader = noodles::vcf::Reader::new(input); - - let vcf_header: noodles::vcf::Header = reader.read_header()?; - - // Parquet section - let schema = schema::from_header(&vcf_header, info_optional)?; - - let 
mut iterator = reader.records(&vcf_header); - let chunk_iterator = record2chunk::Record2Chunk::new( - &mut iterator, - batch_size, - vcf_header.clone(), - schema.clone(), - ); - - let options = arrow2::io::parquet::write::WriteOptions { - write_statistics: true, - compression, - version: arrow2::io::parquet::write::Version::V2, - data_pagesize_limit: Some(batch_size), - }; - - let encodings = chunk_iterator.encodings(); - let row_groups = arrow2::io::parquet::write::RowGroupIterator::try_new( - chunk_iterator, - &schema, - options, - encodings, - )?; - - for (index, group) in row_groups.enumerate() { - let output = std::fs::File::create(template.replace("{}", &index.to_string()))?; - let mut writer = - arrow2::io::parquet::write::FileWriter::try_new(output, schema.clone(), options)?; - - writer.write(group?)?; - writer.end(None)?; - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - - static VCF_FILE: &[u8] = b"##fileformat=VCFv4.3 -#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO -1\t925952\t1019397\tG\tA\t.\t.\t. 
-"; - - static PARQUET_FILE: &[u8] = &[ - 80, 65, 82, 49, 21, 6, 21, 10, 21, 50, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, - 28, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, - 96, 48, 4, 0, 151, 222, 156, 170, 5, 0, 0, 0, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, - 114, 111, 109, 111, 115, 111, 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, - 40, 1, 49, 24, 1, 49, 0, 0, 21, 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, - 21, 0, 17, 28, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 0, 31, 139, 8, 0, 0, - 0, 0, 0, 0, 255, 99, 80, 228, 99, 0, 0, 69, 222, 72, 134, 4, 0, 0, 0, 21, 2, 25, 37, 0, 6, - 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, - 1, 60, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 21, 6, 21, 30, 21, 70, 92, - 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, - 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 0, 3, 0, 3, 1, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, - 99, 103, 96, 96, 48, 52, 48, 180, 52, 182, 52, 7, 0, 69, 88, 164, 201, 11, 0, 0, 0, 21, 12, - 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 4, 108, 105, - 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, 160, 3, 60, 54, 0, 40, 7, 49, - 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 21, 6, 21, 10, 21, 50, 92, - 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 0, 31, - 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 7, 0, 158, 10, 250, 19, 5, 0, 0, 0, - 21, 12, 25, 37, 0, 6, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, - 22, 74, 22, 114, 38, 186, 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 21, 6, 21, 10, 21, 50, - 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 0, - 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 4, 0, 171, 175, 153, 250, 5, 0, 0, - 0, 21, 12, 
25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 4, 22, - 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 21, 6, 21, 4, 21, - 44, 92, 21, 2, 21, 2, 21, 2, 21, 0, 21, 4, 21, 0, 17, 28, 54, 2, 0, 0, 0, 3, 0, 31, 139, 8, - 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, - 117, 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 21, - 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 0, 0, 0, 3, - 0, 3, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 12, 25, 37, - 0, 6, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, - 116, 101, 114, 21, 4, 22, 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 21, 12, 25, 5, - 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 21, 2, 25, 5, 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, - 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, - 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, - 25, 5, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 0, 22, - 0, 22, 0, 22, 0, 38, 0, 0, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, - 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, - 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, - 0, 25, 17, 2, 25, 24, 1, 49, 25, 24, 1, 49, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 4, 0, - 33, 14, 0, 25, 24, 4, 0, 33, 14, 0, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 7, 49, 48, 49, - 57, 51, 57, 55, 25, 24, 7, 49, 48, 49, 57, 51, 57, 55, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, - 24, 1, 71, 25, 24, 1, 71, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 65, 
25, 24, 1, 65, 21, - 0, 25, 22, 0, 0, 25, 17, 1, 25, 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 2, 0, 25, 17, 1, 25, - 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 0, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, - 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, - 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, - 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 28, 22, 8, 21, 114, 22, 0, 0, 0, 25, 28, 22, - 202, 1, 21, 124, 22, 0, 0, 0, 25, 28, 22, 160, 3, 21, 158, 1, 22, 0, 0, 0, 25, 28, 22, 186, - 5, 21, 114, 22, 0, 0, 0, 25, 28, 22, 252, 6, 21, 114, 22, 0, 0, 0, 25, 28, 22, 190, 8, 21, - 96, 22, 0, 0, 0, 25, 28, 22, 222, 9, 21, 100, 22, 0, 0, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, - 25, 12, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, 21, 4, 25, 204, 72, 4, 114, 111, 111, 116, 21, - 14, 0, 21, 12, 37, 0, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 37, 0, 76, - 28, 0, 0, 0, 21, 2, 37, 0, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 0, 53, 0, 24, 10, - 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, - 4, 108, 105, 115, 116, 21, 2, 0, 21, 12, 37, 0, 24, 2, 105, 100, 37, 0, 76, 28, 0, 0, 0, - 21, 12, 37, 0, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 37, 0, 76, 28, 0, 0, 0, - 21, 12, 37, 0, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 37, 0, 76, 28, 0, 0, 0, - 21, 8, 37, 2, 24, 7, 113, 117, 97, 108, 105, 116, 121, 0, 53, 0, 24, 6, 102, 105, 108, 116, - 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, 4, 108, 105, 115, 116, 21, 2, 0, 21, - 12, 37, 0, 24, 6, 102, 105, 108, 116, 101, 114, 37, 0, 76, 28, 0, 0, 0, 22, 2, 25, 44, 25, - 124, 38, 122, 28, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, - 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 22, - 222, 17, 21, 20, 22, 178, 14, 21, 34, 0, 38, 198, 2, 28, 21, 2, 25, 37, 0, 6, 25, 24, 8, - 112, 111, 115, 105, 116, 105, 
111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, 1, 60, 54, - 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 22, 242, 17, 21, 22, 22, 212, 14, 21, - 46, 0, 38, 190, 4, 28, 21, 12, 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, - 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, - 160, 3, 60, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, - 0, 22, 136, 18, 21, 24, 22, 130, 15, 21, 58, 0, 38, 172, 6, 28, 21, 12, 25, 37, 0, 6, 25, - 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 186, - 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 22, 160, 18, 21, 22, 22, 188, 15, 21, 34, 0, 38, - 238, 7, 28, 21, 12, 25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, - 4, 22, 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 22, 182, 18, - 21, 22, 22, 222, 15, 21, 34, 0, 38, 158, 9, 28, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, 117, - 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 22, 204, - 18, 21, 22, 22, 128, 16, 21, 34, 0, 38, 194, 10, 28, 21, 12, 25, 37, 0, 6, 25, 56, 6, 102, - 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 4, 22, - 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 22, 226, 18, 21, 22, 22, 162, 16, 21, 34, - 0, 22, 156, 4, 22, 2, 38, 8, 22, 180, 6, 20, 0, 0, 25, 124, 38, 0, 28, 21, 12, 25, 5, 25, - 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, - 0, 22, 248, 18, 21, 6, 22, 196, 16, 21, 22, 0, 38, 0, 28, 21, 2, 25, 5, 25, 24, 8, 112, - 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 254, 18, 21, - 6, 22, 218, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, - 105, 102, 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 22, 132, 19, 21, 6, 22, 240, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 
24, 9, - 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 138, - 19, 21, 6, 22, 134, 17, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, - 114, 110, 97, 116, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 144, 19, 21, 6, 22, 156, - 17, 21, 22, 0, 38, 0, 28, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, 0, - 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 150, 19, 21, 6, 22, 178, 17, 21, 22, 0, 38, 0, 28, 21, - 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, - 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 156, 19, 21, 6, 22, 200, 17, - 21, 22, 0, 22, 0, 22, 0, 38, 0, 22, 0, 20, 2, 0, 25, 28, 24, 12, 65, 82, 82, 79, 87, 58, - 115, 99, 104, 101, 109, 97, 24, 244, 6, 47, 47, 47, 47, 47, 52, 56, 67, 65, 65, 65, 69, 65, - 65, 65, 65, 56, 118, 47, 47, 47, 120, 81, 65, 65, 65, 65, 69, 65, 65, 69, 65, 65, 65, 65, - 75, 65, 65, 115, 65, 67, 65, 65, 75, 65, 65, 81, 65, 43, 80, 47, 47, 47, 119, 119, 65, 65, - 65, 65, 73, 65, 65, 103, 65, 65, 65, 65, 69, 65, 65, 99, 65, 65, 65, 65, 103, 65, 103, 65, - 65, 48, 65, 69, 65, 65, 70, 65, 66, 65, 65, 65, 77, 65, 81, 65, 65, 121, 65, 65, 65, 65, - 73, 81, 65, 65, 65, 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 50, 119, 65, 65, 65, 66, - 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, - 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 81, 65, 65, 65, 65, 81, - 65, 65, 65, 68, 115, 47, 47, 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, - 65, 65, 65, 66, 81, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, - 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, - 65, 71, 65, 65, 65, 65, 90, 109, 108, 115, 100, 71, 86, 121, 65, 65, 68, 56, 47, 47, 47, - 47, 66, 65, 65, 69, 65, 65, 89, 65, 65, 65, 66, 109, 97, 87, 120, 48, 90, 88, 73, 65, 65, - 79, 122, 47, 47, 47, 56, 119, 65, 65, 65, 65, 73, 65, 
65, 65, 65, 66, 103, 65, 65, 65, 65, - 66, 65, 119, 65, 65, 69, 65, 65, 83, 65, 65, 81, 65, 69, 65, 65, 82, 65, 65, 103, 65, 65, - 65, 65, 77, 65, 65, 65, 65, 65, 65, 68, 54, 47, 47, 47, 47, 65, 81, 65, 71, 65, 65, 89, 65, - 66, 65, 65, 72, 65, 65, 65, 65, 99, 88, 86, 104, 98, 71, 108, 48, 101, 81, 68, 115, 47, 47, - 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, 65, 65, 66, 81, 65, 65, 65, - 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, 65, 65, 65, 65, 68, 65, 65, 65, - 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, 65, 74, 65, 65, 65, 65, 89, 87, - 120, 48, 90, 88, 74, 117, 89, 88, 82, 108, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, - 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, - 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, - 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 81, 65, 65, 65, 72, 74, 108, 90, - 109, 86, 121, 90, 87, 53, 106, 90, 81, 65, 65, 65, 79, 122, 47, 47, 47, 57, 111, 65, 65, - 65, 65, 88, 65, 65, 65, 65, 66, 103, 65, 65, 65, 65, 77, 65, 65, 65, 65, 69, 65, 65, 82, - 65, 65, 81, 65, 65, 65, 65, 81, 65, 65, 103, 65, 65, 65, 65, 77, 65, 65, 69, 65, 65, 65, - 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, 65, 65, 65, 65, 103, 65, 65, 65, 65, - 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, - 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, - 65, 81, 65, 65, 103, 65, 65, 65, 71, 108, 107, 65, 65, 68, 56, 47, 47, 47, 47, 66, 65, 65, - 69, 65, 65, 111, 65, 65, 65, 66, 112, 90, 71, 86, 117, 100, 71, 108, 109, 97, 87, 86, 121, - 65, 65, 68, 115, 47, 47, 47, 47, 79, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, - 65, 65, 65, 103, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, - 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 57, 80, 47, 47, 47, 121, 65, 65, 65, 65, - 65, 66, 65, 65, 65, 65, 
67, 65, 65, 74, 65, 65, 81, 65, 67, 65, 65, 73, 65, 65, 65, 65, 99, - 71, 57, 122, 97, 88, 82, 112, 98, 50, 52, 65, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, - 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, - 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, - 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 103, 65, 65, 65, 71, 78, 111, 99, - 109, 57, 116, 98, 51, 78, 118, 98, 87, 85, 65, 0, 24, 44, 65, 114, 114, 111, 119, 50, 32, - 45, 32, 78, 97, 116, 105, 118, 101, 32, 82, 117, 115, 116, 32, 105, 109, 112, 108, 101, - 109, 101, 110, 116, 97, 116, 105, 111, 110, 32, 111, 102, 32, 65, 114, 114, 111, 119, 0, - 107, 7, 0, 0, 80, 65, 82, 49, - ]; - - #[test] - fn convert_positives() { - let mut input = std::io::BufReader::new(&*VCF_FILE); - let mut output = Vec::new(); - - vcf2parquet( - &mut input, - &mut output, - 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), - false, - ) - .unwrap(); - assert_eq!(output, *PARQUET_FILE); - } - - #[test] - fn not_a_vcf() { - let raw_data = [b'#', b'a', b'b', b'c', 255, 0x7F, b'\n'].to_vec(); - let mut input = std::io::BufReader::new(&raw_data[..]); - let mut output = Vec::new(); - - let result = vcf2parquet( - &mut input, - &mut output, - 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), - false, - ); - - assert!(result.is_err()); - } - - #[test] - fn multi_positives() { - let mut input = std::io::BufReader::new(&*VCF_FILE); - let dir = tempfile::tempdir().unwrap(); - - let format = dir - .path() - .join("test_{}.parquet") - .as_os_str() - .to_str() - .unwrap() - .to_string(); - - vcf2multiparquet( - &mut input, - &format, - 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), - false, - ) - .unwrap(); - } -} From 92a4598cad2f5725270d496112a1e85d1a5ecc59 Mon Sep 17 00:00:00 2001 From: Charles Monod-Broca Date: Fri, 29 Mar 2024 17:33:28 +0100 Subject: [PATCH 3/8] Fixes 13 and 14 Tests working, 
only need to make python binding work --- src/name2data.rs | 163 +++++++++++++++++++++++++++++++++----------- src/record2chunk.rs | 18 ++++- 2 files changed, 137 insertions(+), 44 deletions(-) diff --git a/src/name2data.rs b/src/name2data.rs index 4d48c84..09bc2bc 100644 --- a/src/name2data.rs +++ b/src/name2data.rs @@ -2,6 +2,9 @@ /* std use */ +use arrow2::datatypes::Field; +use std::collections::HashMap; + /* crate use */ use arrow2::array::MutableArray; use arrow2::array::MutablePrimitiveArray; @@ -43,7 +46,7 @@ impl Name2Data { &mut self, record: noodles::vcf::Record, header: &noodles::vcf::Header, - schema: &arrow2::datatypes::Schema, + schema: &HashMap, ) -> std::result::Result<(), arrow2::error::Error> { let allele_count = record.alternate_bases().len() + 1; for (alt_id, allele) in record.alternate_bases().iter().enumerate() { @@ -77,7 +80,7 @@ impl Name2Data { &mut self, record: &noodles::vcf::Record, header: &noodles::vcf::Header, - schema: &arrow2::datatypes::Schema, + schema: &HashMap, alt_id: usize, allele_count: usize, ) -> std::result::Result<(), arrow2::error::Error> { @@ -352,7 +355,35 @@ impl Name2Data { } }, }, - None => column.push_null(), + None => { + if let Some(field) = schema.get(&key_name) { + match field.data_type { + arrow2::datatypes::DataType::FixedSizeList( + ref field_type, + fixed_size, + ) => match &field_type.data_type() { + arrow2::datatypes::DataType::Int32 => { + column.push_veci32(vec![Some(0); fixed_size])? + } + + arrow2::datatypes::DataType::Float32 => { + column.push_vecf32(vec![Some(0.); fixed_size])? 
+ } + + arrow2::datatypes::DataType::Utf8 => column + .push_vecstring(vec![ + Some("".to_string()); + fixed_size + ])?, + + _ => column.push_null(), + }, + _ => column.push_null(), + } + } else { + unreachable!("{} should be in schema", key_name); + } + } }, None => { if info_def.ty() @@ -361,30 +392,32 @@ impl Name2Data { column.push_bool(Some(false)); } else { //Handle missing info field, only matters for FixedSizeList - for field in schema.fields.iter() { - if field.name == key_name { - match field.data_type { - arrow2::datatypes::DataType::FixedSizeList( - ref field_type, - fixed_size, - ) => match &field_type.data_type() { - arrow2::datatypes::DataType::Int32 => { - column.push_veci32(vec![None; fixed_size])? - } + if let Some(field) = schema.get(&key_name) { + match field.data_type { + arrow2::datatypes::DataType::FixedSizeList( + ref field_type, + fixed_size, + ) => match &field_type.data_type() { + arrow2::datatypes::DataType::Int32 => { + column.push_veci32(vec![Some(0); fixed_size])? + } - arrow2::datatypes::DataType::Float32 => { - column.push_vecf32(vec![None; fixed_size])? - } + arrow2::datatypes::DataType::Float32 => { + column.push_vecf32(vec![Some(0.); fixed_size])? + } - arrow2::datatypes::DataType::Utf8 => { - column.push_vecstring(vec![None; fixed_size])? 
- } + arrow2::datatypes::DataType::Utf8 => column + .push_vecstring(vec![ + Some("".to_string()); + fixed_size + ])?, - _ => column.push_null(), - }, - _ => column.push_null(), //Otherwise, just push null - } + _ => column.push_null(), + }, + _ => column.push_null(), //Otherwise, just push null } + } else { + unreachable!("Malformed VCF, {} should be in schema", key_name); } } } @@ -398,7 +431,7 @@ impl Name2Data { &mut self, record: &noodles::vcf::Record, header: &noodles::vcf::Header, - schema: &arrow2::datatypes::Schema, + schema: &HashMap, alt_id: usize, allele_count: usize, ) -> std::result::Result<(), arrow2::error::Error> { @@ -718,36 +751,77 @@ impl Name2Data { }, }, - None => column.push_null(), + None => { + if let Some(field) = schema.get(&key_name) { + match field.data_type { + arrow2::datatypes::DataType::FixedSizeList( + ref field_type, + fixed_size, + ) => match &field_type.data_type() { + arrow2::datatypes::DataType::Int32 => { + column.push_veci32(vec![Some(0); fixed_size])? + } + arrow2::datatypes::DataType::Float32 => { + column.push_vecf32(vec![Some(0.); fixed_size])? + } + arrow2::datatypes::DataType::Utf8 => column + .push_vecstring(vec![Some("".to_string()); fixed_size])?, + _ => column.push_null(), + }, + _ => column.push_null(), + } + } else { + unreachable!("{} should be in schema", key_name); + } + }, }, - None => column.push_null(), - } - } else { - //Handle missing format field, only matters for FixedSizeList - for field in schema.fields.iter() { - if field.name == key_name { + None => if let Some(field) = schema.get(&key_name) { match field.data_type { arrow2::datatypes::DataType::FixedSizeList( ref field_type, fixed_size, ) => match &field_type.data_type() { arrow2::datatypes::DataType::Int32 => { - column.push_veci32(vec![None; fixed_size])? + column.push_veci32(vec![Some(0); fixed_size])? } - arrow2::datatypes::DataType::Float32 => { - column.push_vecf32(vec![None; fixed_size])? 
- } - - arrow2::datatypes::DataType::Utf8 => { - column.push_vecstring(vec![None; fixed_size])? + column.push_vecf32(vec![Some(0.); fixed_size])? } - + arrow2::datatypes::DataType::Utf8 => column + .push_vecstring(vec![Some("".to_string()); fixed_size])?, _ => column.push_null(), }, _ => column.push_null(), } + } else { + unreachable!("{} should be in schema", key_name); + }, + } + } else { + //Handle missing format field, only matters for FixedSizeList + if let Some(field) = schema.get(&key_name) { + match field.data_type { + arrow2::datatypes::DataType::FixedSizeList( + ref field_type, + fixed_size, + ) => match &field_type.data_type() { + arrow2::datatypes::DataType::Int32 => { + column.push_veci32(vec![Some(0); fixed_size])? + } + + arrow2::datatypes::DataType::Float32 => { + column.push_vecf32(vec![Some(0.); fixed_size])? + } + + arrow2::datatypes::DataType::Utf8 => column + .push_vecstring(vec![Some("".to_string()); fixed_size])?, + + _ => column.push_null(), + }, + _ => column.push_null(), } + } else { + unreachable!("Malformed VCF, {} should be in schema", key_name); } } } @@ -1072,12 +1146,19 @@ chr2 300 . 
G A 70 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_R let header: noodles::vcf::Header = reader.read_header().unwrap(); let schema = schema::from_header(&header, false).unwrap(); + let schema_map: HashMap = schema + .fields + .iter() + .cloned() + .map(|f| (f.name.clone(), f)) + .collect(); + let mut data = Name2Data::new(10, &schema); let mut iterator = reader.records(&header); let record = iterator.next().unwrap().unwrap(); - data.add_record(record, &header, &schema).unwrap(); + data.add_record(record, &header, &schema_map).unwrap(); assert_eq!(format!("{:?}", data.get("alternate")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1]), values: [84] }, validity: None }))".to_string()); assert_eq!(format!("{:?}", data.get("chromosome")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 4]), values: [99, 104, 114, 49] }, validity: None }))".to_string()); @@ -1169,7 +1250,7 @@ chr2 300 . 
G A 70 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_R let record = iterator.next().unwrap().unwrap(); let mut data = Name2Data::new(10, &schema); - data.add_record(record, &header, &schema).unwrap(); + data.add_record(record, &header, &schema_map).unwrap(); assert_eq!(format!("{:?}", data.get("alternate")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1, 3]), values: [71, 67, 71] }, validity: None }))".to_string()); assert_eq!(format!("{:?}", data.get("filter")), "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 1, 2]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 4, 8]), values: [80, 65, 83, 83, 80, 65, 83, 83] }, validity: None }, validity: None }))".to_string()); diff --git a/src/record2chunk.rs b/src/record2chunk.rs index 133e7fb..85dcc39 100644 --- a/src/record2chunk.rs +++ b/src/record2chunk.rs @@ -4,6 +4,8 @@ /* crate use */ +use arrow2::datatypes::Field; + /* project use */ use crate::name2data::*; @@ -12,6 +14,7 @@ pub struct Record2Chunk { length: usize, header: noodles::vcf::Header, schema: arrow2::datatypes::Schema, + schema_map: std::collections::HashMap, end: bool, } @@ -25,13 +28,22 @@ where header: noodles::vcf::Header, schema: arrow2::datatypes::Schema, ) -> Self { - Self { + let mut res = Self { inner, length, header, schema, + schema_map: Default::default(), end: false, - } + }; + res.schema_map = res + .schema + .fields + .iter() + .cloned() + .map(|f| (f.name.clone(), f)) + .collect(); + res } pub fn encodings(&self) -> Vec> { @@ -66,7 +78,7 @@ where for _ in 0..self.length { match self.inner.next() { Some(Ok(record)) => { - if let Err(e) = name2data.add_record(record, &self.header, &self.schema) { + if let Err(e) = name2data.add_record(record, &self.header, &self.schema_map) { return Some(Err(e)); } } From 
9caa8c9baef0cbded5e951b2d719afb104b40aea Mon Sep 17 00:00:00 2001 From: Pierre Marijon Date: Tue, 2 Apr 2024 12:45:39 +0200 Subject: [PATCH 4/8] fix: correct dependency in python binding --- python/Cargo.toml | 2 +- python/src/error.rs | 2 +- python/src/lib.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/Cargo.toml b/python/Cargo.toml index ffa4b6b..aa0c39d 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -9,7 +9,7 @@ name = "pyvcf2parquet" crate-type = ["cdylib"] [dependencies] -vcf2parquet-lib = { version = "0.6.0", path = "../vcf2parquet-lib" } +vcf2parquet = { version = "0.6.0", path = "../" } thiserror = "1" niffler = { version = "2" } arrow2 = { version = "0.17", features = ["io_parquet", "io_parquet_compression"] } diff --git a/python/src/error.rs b/python/src/error.rs index 621da37..c5312fc 100644 --- a/python/src/error.rs +++ b/python/src/error.rs @@ -11,7 +11,7 @@ use pyo3::{ }, }; use thiserror::Error; -use vcf2parquet_lib::error::Error as Vcf2ParquetError; +use vcf2parquet::error::Error as Vcf2ParquetError; #[derive(Error)] pub enum PyVcf2ParquetErr { diff --git a/python/src/lib.rs b/python/src/lib.rs index b1fd80b..175dc2a 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1,7 +1,7 @@ //! 
vcf2parquet python binding /* std use */ -use vcf2parquet_lib as lib; +use vcf2parquet as lib; /* crate use */ use pyo3::prelude::*; From 2d276928b7144bd65f81f538cc48a55719eff724 Mon Sep 17 00:00:00 2001 From: Charles Monod-Broca Date: Tue, 2 Apr 2024 14:09:16 +0200 Subject: [PATCH 5/8] Replace arrow2 by arrow: - improve test coverage - decorelate some code - support differente type au parquet version - remove support of boolean list --- .github/workflows/main.yaml | 4 +- Cargo.lock | 813 +++++++++++++++++--------- Cargo.toml | 3 +- python/Cargo.toml | 9 +- python/src/error.rs | 2 +- python/src/lib.rs | 38 +- src/{main.rs => bin/vcf2parquet.rs} | 4 +- src/cli.rs | 82 ++- src/columndata.rs | 220 ++++++++ src/error.rs | 6 +- src/lib.rs | 361 ++++++------ src/name2data.rs | 846 +++++++++++----------------- src/record2chunk.rs | 64 +-- src/schema.rs | 445 ++++++++------- tests/data/test.parquet | Bin 13171 -> 47127 bytes tests/data/test.vcf | 95 +++- tests/functional.rs | 48 +- 17 files changed, 1707 insertions(+), 1333 deletions(-) rename src/{main.rs => bin/vcf2parquet.rs} (93%) create mode 100644 src/columndata.rs diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index f95d5d2..39cd5fa 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -70,12 +70,12 @@ jobs: uses: actions/checkout@v2 - name: Generate code coverage - run: cargo +nightly tarpaulin --all-features --workspace --timeout 120 --out xml + run: cargo +nightly tarpaulin --all-features --workspace --engine llvm --timeout 120 --out xml - name: Upload to codecov.io uses: codecov/codecov-action@v2 with: - # token: ${{secrets.CODECOV_TOKEN}} # not required for public repos + token: ${{secrets.CODECOV_TOKEN}} # not required for public repos fail_ci_if_error: true lints: diff --git a/Cargo.lock b/Cargo.lock index 874476d..2a0aff6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,12 +15,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", + "const-random", "getrandom", "once_cell", "version_check", "zerocopy", ] +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "alloc-no-stdlib" version = "2.0.4" @@ -36,6 +46,21 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" version = "0.6.13" @@ -85,107 +110,213 @@ dependencies = [ ] [[package]] -name = "array-init-cursor" -version = "0.2.0" +name = "arrow" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" +checksum = "219d05930b81663fd3b32e3bde8ce5bff3c4d23052a99f11a8fa50a3b47b2658" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] [[package]] -name = "arrow-format" -version = "0.8.1" +name = "arrow-arith" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07884ea216994cdc32a2d5f8274a8bee979cfe90274b83f86f440866ee3132c7" +checksum = "0272150200c07a86a390be651abdd320a2d12e84535f0837566ca87ecd8f95e0" dependencies = [ - "planus", - "serde", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + 
"chrono", + "half", + "num", ] [[package]] -name = "arrow2" -version = "0.18.0" +name = "arrow-array" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "963fef509b757bcbbf9e5ffa23bcb345614d99f4f6f531f97417b27b8604d389" +checksum = "8010572cf8c745e242d1b632bd97bd6d4f40fefed5ed1290a8f433abaa686fea" dependencies = [ "ahash", - "arrow-format", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d0a2432f0cba5692bf4cb757469c66791394bac9ec7ce63c1afe74744c37b27" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9abc10cd7995e83505cc290df9384d6e5412b207b79ce6bdff89a10505ed2cba" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", "base64", - "bytemuck", "chrono", - "dyn-clone", - "either", - "ethnum", - "fallible-streaming-iterator", - "foreign_vec", - "futures", - "getrandom", - "hash_hasher", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-data" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2742ac1f6650696ab08c88f6dd3f0eb68ce10f8c253958a18c943a68cd04aec5" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a42ea853130f7e78b9b9d178cb4cd01dee0f78e64d96c2949dc0a915d6d9e19d" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-ord" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "e3e6b61e3dc468f503181dccc2fc705bdcc5f2f146755fa5b56d0a6c5943f412" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "half", + "num", +] + +[[package]] +name = "arrow-row" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "848ee52bb92eb459b811fb471175ea3afcf620157674c8794f539838920f9228" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", "hashbrown", - "num-traits", - "parquet2", - "rustc_version", - "simdutf8", - "streaming-iterator", ] [[package]] -name = "assert_cmd" -version = "2.0.14" +name = "arrow-schema" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed72493ac66d5804837f480ab3766c72bdfab91a65e565fc54fa9e42db0073a8" +checksum = "02d9483aaabe910c4781153ae1b6ae0393f72d9ef757d38d09d450070cf2e528" + +[[package]] +name = "arrow-select" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "849524fa70e0e3c5ab58394c770cb8f514d0122d20de08475f7b472ed8075830" dependencies = [ - "anstyle", - "bstr", - "doc-comment", - "predicates", - "predicates-core", - "predicates-tree", - "wait-timeout", + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", ] [[package]] -name = "async-stream" -version = "0.3.5" +name = "arrow-string" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +checksum = "9373cb5a021aee58863498c37eb484998ef13377f69989c6c5ccfbd258236cdb" dependencies = [ - "async-stream-impl", - "futures-core", - "pin-project-lite", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", ] [[package]] -name = "async-stream-impl" -version = "0.3.5" +name = "assert_cmd" +version = 
"2.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +checksum = "ed72493ac66d5804837f480ab3766c72bdfab91a65e565fc54fa9e42db0073a8" dependencies = [ - "proc-macro2", - "quote", - "syn", + "anstyle", + "bstr", + "doc-comment", + "predicates", + "predicates-core", + "predicates-tree", + "wait-timeout", ] [[package]] -name = "async-trait" -version = "0.1.78" +name = "atoi" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "461abc97219de0eaaf81fe3ef974a540158f3d079c2ab200f891f1a2ef201e85" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" dependencies = [ - "proc-macro2", - "quote", - "syn", + "num-traits", ] [[package]] name = "autocfg" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" [[package]] name = "base64" -version = "0.21.7" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" [[package]] name = "bgzip" @@ -203,6 +334,12 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.5.0" @@ -247,26 +384,6 @@ version = "3.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa" -[[package]] -name = "bytemuck" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15" -dependencies = [ - "bytemuck_derive", -] - -[[package]] -name = "bytemuck_derive" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4da9a32f3fed317401fa3c862968128267c3106685286e15d5aaa3d7389c2f60" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "byteorder" version = "1.5.0" @@ -318,11 +435,14 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.35" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eaf5903dcbc0a39312feb77df2ff4c76387d591b9fc7b04a238dcf8bb62639a" +checksum = "8a0d04d43504c61aa6c7531f1871dd0d418d91130162063b789da00fd7057a5e" dependencies = [ + "android-tzdata", + "iana-time-zone", "num-traits", + "windows-targets", ] [[package]] @@ -371,6 +491,32 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + [[package]] name = "crc32fast" version = "1.4.0" @@ -414,6 +560,12 @@ version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "difflib" version = "0.4.0" @@ -426,12 +578,6 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" -[[package]] -name = "dyn-clone" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" - [[package]] name = "either" version = "1.10.0" @@ -454,24 +600,22 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "ethnum" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b90ca2580b73ab6a1f724b76ca11ab632df820fd6040c336200d2c1df7b3c82c" - -[[package]] -name = "fallible-streaming-iterator" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" - [[package]] name = "fastrand" version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +[[package]] +name = "flatbuffers" +version = "23.5.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" +dependencies = [ + "bitflags 1.3.2", + "rustc_version", +] + [[package]] name = "flate2" version = "1.0.28" @@ -483,167 +627,165 @@ dependencies = [ ] [[package]] 
-name = "foreign_vec" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" - -[[package]] -name = "futures" -version = "0.3.30" +name = "getrandom" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", + "cfg-if", + "libc", + "wasi", ] [[package]] -name = "futures-channel" -version = "0.3.30" +name = "half" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "b5eceaaeec696539ddaf7b333340f1af35a5aa87ae3e4f3ead0532f72affab2e" dependencies = [ - "futures-core", - "futures-sink", + "cfg-if", + "crunchy", + "num-traits", ] [[package]] -name = "futures-core" -version = "0.3.30" +name = "hashbrown" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" [[package]] -name = "futures-executor" -version = "0.3.30" +name = "heck" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] -name = "futures-io" -version = "0.3.30" +name = "iana-time-zone" +version = "0.1.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] [[package]] -name = "futures-macro" -version = "0.3.30" +name = "iana-time-zone-haiku" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" dependencies = [ - "proc-macro2", - "quote", - "syn", + "cc", ] [[package]] -name = "futures-sink" -version = "0.3.30" +name = "indexmap" +version = "2.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" +dependencies = [ + "equivalent", + "hashbrown", +] [[package]] -name = "futures-task" -version = "0.3.30" +name = "integer-encoding" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] -name = "futures-util" -version = "0.3.30" +name = "jobserver" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", + "libc", ] [[package]] -name = "getrandom" -version = "0.2.12" +name = "js-sys" +version = "0.3.69" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ - "cfg-if", - "js-sys", - "libc", - "wasi", "wasm-bindgen", ] [[package]] -name = "hash_hasher" -version = "2.0.3" +name = "lazy_static" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] -name = "hashbrown" -version = "0.14.3" +name = "lexical-core" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" dependencies = [ - "ahash", + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", ] [[package]] -name = "heck" -version = "0.5.0" +name = "lexical-parse-float" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] [[package]] -name = "indexmap" -version = "2.2.5" +name = "lexical-parse-integer" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" dependencies = [ - "equivalent", - "hashbrown", + "lexical-util", + "static_assertions", ] [[package]] -name = "jobserver" -version = "0.1.28" +name = "lexical-util" +version = "0.8.5" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" dependencies = [ - "libc", + "static_assertions", ] [[package]] -name = "js-sys" -version = "0.3.69" +name = "lexical-write-float" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" dependencies = [ - "wasm-bindgen", + "lexical-util", + "lexical-write-integer", + "static_assertions", ] [[package]] -name = "lazy_static" -version = "1.4.0" +name = "lexical-write-integer" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util", + "static_assertions", +] [[package]] name = "libc" @@ -651,6 +793,12 @@ version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + [[package]] name = "linux-raw-sys" version = "0.4.13" @@ -664,23 +812,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] -name = "lz4" -version = "1.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e9e2dd86df36ce760a60f6ff6ad526f7ba1f14ba0356f8254fb6905e6494df1" -dependencies = [ - "libc", - "lz4-sys", -] - -[[package]] -name = "lz4-sys" -version = "1.9.4" 
+name = "lz4_flex" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" dependencies = [ - "cc", - "libc", + "twox-hash", ] [[package]] @@ -721,7 +858,7 @@ dependencies = [ "flate2", "thiserror", "xz2", - "zstd", + "zstd 0.12.4", ] [[package]] @@ -793,6 +930,72 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d869c01cc0c455284163fd0092f1f93835385ccab5a98a0dcc497b2f8bf055a9" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.18" @@ -800,6 +1003,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -809,50 +1013,57 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] -name = "parquet-format-safe" -version = "0.2.4" +name = "ordered-float" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1131c54b167dd4e4799ce762e1ab01549ebb94d5bdd13e6ec1b467491c378e1f" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" dependencies = [ - "async-trait", - "futures", + "num-traits", ] [[package]] -name = "parquet2" -version = "0.17.2" +name = "parquet" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "579fe5745f02cef3d5f236bfed216fd4693e49e4e920a13475c6132233283bce" +checksum = "096795d4f47f65fd3ee1ec5a98b77ab26d602f2cc785b0e4be5443add17ecc32" dependencies = [ - "async-stream", + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", "brotli", + "bytes", + "chrono", "flate2", - "futures", - "lz4", - "parquet-format-safe", + "half", + "hashbrown", + "lz4_flex", + "num", + "num-bigint", + "paste", "seq-macro", "snap", - "streaming-decompression", - "zstd", + "thrift", + "twox-hash", + "zstd 0.13.1", ] [[package]] -name = "percent-encoding" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" - -[[package]] -name = "pin-project-lite" -version = 
"0.2.13" +name = "paste" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] -name = "pin-utils" -version = "0.1.0" +name = "percent-encoding" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pkg-config" @@ -860,15 +1071,6 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" -[[package]] -name = "planus" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc1691dd09e82f428ce8d6310bd6d5da2557c82ff17694d2a32cad7242aea89f" -dependencies = [ - "array-init-cursor", -] - [[package]] name = "predicates" version = "3.1.0" @@ -934,11 +1136,34 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "regex" +version = "1.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + [[package]] name = "regex-automata" version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" [[package]] name = "rustc-hash" @@ -961,13 +1186,19 @@ version = "0.38.31" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags", + "bitflags 2.5.0", "errno", "libc", "linux-raw-sys", "windows-sys", ] +[[package]] +name = "ryu" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" + [[package]] name = "semver" version = "1.0.22" @@ -1000,21 +1231,6 @@ dependencies = [ "syn", ] -[[package]] -name = "simdutf8" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" - -[[package]] -name = "slab" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] - [[package]] name = "snap" version = "1.1.1" @@ -1022,19 +1238,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] -name = "streaming-decompression" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf6cc3b19bfb128a8ad11026086e31d3ce9ad23f8ea37354b31383a187c44cf3" -dependencies = [ - "fallible-streaming-iterator", -] - -[[package]] -name = "streaming-iterator" -version = "0.1.9" +name = "static_assertions" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "strsim" @@ -1091,6 +1298,36 @@ dependencies = [ "syn", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + [[package]] name = "unicode-ident" version = "1.0.12" @@ -1107,13 +1344,14 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" name = "vcf2parquet" version = "0.6.0" dependencies = [ - "arrow2", + "arrow", "assert_cmd", "clap", "lazy_static", "log", "niffler", "noodles", + "parquet", "rayon", "rustc-hash", "tempfile", @@ -1195,6 +1433,15 @@ version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -1296,7 +1543,16 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" dependencies = [ - "zstd-safe", + "zstd-safe 6.0.6", +] + +[[package]] +name = "zstd" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d789b1514203a1120ad2429eae43a7bd32b90976a7bb8a05f7ec02fa88cc23a" +dependencies = [ + "zstd-safe 7.1.0", ] [[package]] @@ -1309,11 +1565,20 @@ dependencies = [ 
"zstd-sys", ] +[[package]] +name = "zstd-safe" +version = "7.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd99b45c6bc03a018c8b8a86025678c87e55526064e38f9df301989dce7ec0a" +dependencies = [ + "zstd-sys", +] + [[package]] name = "zstd-sys" -version = "2.0.9+zstd.1.5.5" +version = "2.0.10+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" +checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" dependencies = [ "cc", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index 4b63f00..9d440e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,8 @@ rayon = { version = "1" } # input output management niffler = { version = "2" } noodles = { version = "0.64", features = ["vcf"] } -arrow2 = { version = "0.18", features = ["io_parquet", "io_parquet_compression"] } +arrow = { version = "51", default-features = false } +parquet = { version = "51" } rustc-hash = { version = "1" } # logging management diff --git a/python/Cargo.toml b/python/Cargo.toml index ffa4b6b..2c26e88 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -9,9 +9,10 @@ name = "pyvcf2parquet" crate-type = ["cdylib"] [dependencies] -vcf2parquet-lib = { version = "0.6.0", path = "../vcf2parquet-lib" } -thiserror = "1" +vcf2parquet = { version = "0.6", path = "../../vcf2parquet" } +thiserror = { version = "1" } niffler = { version = "2" } -arrow2 = { version = "0.17", features = ["io_parquet", "io_parquet_compression"] } -tempfile = "3.10.0" +arrow = { version = "51", default-features = false } +parquet = { version = "51" } +tempfile = { version = "3.10.0" } pyo3 = { version = "0.20", features = ["extension-module"] } diff --git a/python/src/error.rs b/python/src/error.rs index 621da37..c5312fc 100644 --- a/python/src/error.rs +++ b/python/src/error.rs @@ -11,7 +11,7 @@ use pyo3::{ }, }; use thiserror::Error; -use 
vcf2parquet_lib::error::Error as Vcf2ParquetError; +use vcf2parquet::error::Error as Vcf2ParquetError; #[derive(Error)] pub enum PyVcf2ParquetErr { diff --git a/python/src/lib.rs b/python/src/lib.rs index b1fd80b..892a02f 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1,7 +1,7 @@ //! vcf2parquet python binding /* std use */ -use vcf2parquet_lib as lib; +use vcf2parquet as lib; /* crate use */ use pyo3::prelude::*; @@ -24,8 +24,15 @@ pub enum Compression { Zstd, } +#[pyclass] +#[derive(Debug, Clone, Copy)] +pub enum ParquetVersion { + V2_0, + V1_0, +} + #[pyfunction] -#[pyo3(signature = (input,output,read_buffer=8192,batch_size=100_000,compression=Compression::Snappy,info_optional=false))] +#[pyo3(signature = (input,output,read_buffer=8192,batch_size=100_000,compression=Compression::Snappy,info_optional=false,parquet_version=ParquetVersion::V2_0))] fn convert_vcf( input: std::path::PathBuf, output: std::path::PathBuf, @@ -33,6 +40,7 @@ fn convert_vcf( batch_size: usize, compression: Compression, info_optional: bool, + parquet_version: ParquetVersion, ) -> PyResult<()> { let mut reader = std::fs::File::open(input) .map(Box::new) @@ -43,13 +51,24 @@ fn convert_vcf( let mut output = std::fs::File::create(output)?; let compression = match compression { - Compression::Uncompressed => arrow2::io::parquet::write::CompressionOptions::Uncompressed, - Compression::Snappy => arrow2::io::parquet::write::CompressionOptions::Snappy, - Compression::Gzip => arrow2::io::parquet::write::CompressionOptions::Gzip(None), - Compression::Lzo => arrow2::io::parquet::write::CompressionOptions::Lzo, - Compression::Brotli => arrow2::io::parquet::write::CompressionOptions::Brotli(None), - Compression::Lz4 => arrow2::io::parquet::write::CompressionOptions::Lz4, - Compression::Zstd => arrow2::io::parquet::write::CompressionOptions::Zstd(None), + Compression::Uncompressed => parquet::basic::Compression::UNCOMPRESSED, + Compression::Snappy => parquet::basic::Compression::SNAPPY, + 
Compression::Gzip => { + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::default()) + } + Compression::Lzo => parquet::basic::Compression::LZO, + Compression::Brotli => { + parquet::basic::Compression::BROTLI(parquet::basic::BrotliLevel::default()) + } + Compression::Lz4 => parquet::basic::Compression::LZ4, + Compression::Zstd => { + parquet::basic::Compression::ZSTD(parquet::basic::ZstdLevel::default()) + } + }; + + let parquet_version = match parquet_version { + ParquetVersion::V2_0 => parquet::file::properties::WriterVersion::PARQUET_2_0, + ParquetVersion::V1_0 => parquet::file::properties::WriterVersion::PARQUET_1_0, }; lib::vcf2parquet( @@ -58,6 +77,7 @@ fn convert_vcf( batch_size, compression, info_optional, + parquet_version, ) .map_err(PyVcf2ParquetErr::from) .map_err(PyErr::from) diff --git a/src/main.rs b/src/bin/vcf2parquet.rs similarity index 93% rename from src/main.rs rename to src/bin/vcf2parquet.rs index efe6894..4cbed1b 100644 --- a/src/main.rs +++ b/src/bin/vcf2parquet.rs @@ -11,7 +11,7 @@ use vcf2parquet::error; /* mod section */ -pub fn main() -> error::Result<()> { +fn main() -> error::Result<()> { let params = cli::Command::parse(); match params.subcommand() { @@ -34,6 +34,7 @@ fn convert(params: &cli::Command, subparams: &cli::Convert) -> error::Result<()> params.batch_size(), params.compression(), params.info_optional(), + params.parquet_version(), )?; Ok(()) @@ -51,6 +52,7 @@ fn split(params: &cli::Command, subparams: &cli::Split) -> error::Result<()> { params.batch_size(), params.compression(), params.info_optional(), + params.parquet_version(), )?; Ok(()) diff --git a/src/cli.rs b/src/cli.rs index db4e958..8d1fd93 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -3,9 +3,19 @@ /* std use */ /* crate use */ +use parquet::file::properties::WriterVersion; /* project use */ +/// Parquet version available for user +#[derive(Debug, clap::ValueEnum, Clone, Copy)] +pub enum ParquetVersion { + ///Parquet version 1 + V1, + ///Parquet version 
2 + V2, +} + /// Compression available for user #[derive(Debug, clap::ValueEnum, Clone, Copy)] pub enum Compression { @@ -60,6 +70,9 @@ pub struct Command { #[clap(short = 'I', long = "info-optional")] info_optional: bool, + #[clap(long = "parquet-version")] + parquet_version: Option, + #[clap(subcommand)] subcommand: SubCommand, } @@ -102,20 +115,31 @@ impl Command { } /// Get compression set by user or default value - pub fn compression(&self) -> arrow2::io::parquet::write::CompressionOptions { + pub fn compression(&self) -> parquet::basic::Compression { match self.compression { - Some(Compression::Uncompressed) => { - arrow2::io::parquet::write::CompressionOptions::Uncompressed + Some(Compression::Uncompressed) => parquet::basic::Compression::UNCOMPRESSED, + Some(Compression::Snappy) => parquet::basic::Compression::SNAPPY, + Some(Compression::Gzip) => { + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::default()) } - Some(Compression::Snappy) => arrow2::io::parquet::write::CompressionOptions::Snappy, - Some(Compression::Gzip) => arrow2::io::parquet::write::CompressionOptions::Gzip(None), - Some(Compression::Lzo) => arrow2::io::parquet::write::CompressionOptions::Lzo, + Some(Compression::Lzo) => parquet::basic::Compression::LZO, Some(Compression::Brotli) => { - arrow2::io::parquet::write::CompressionOptions::Brotli(None) + parquet::basic::Compression::BROTLI(parquet::basic::BrotliLevel::default()) + } + Some(Compression::Lz4) => parquet::basic::Compression::LZ4, + Some(Compression::Zstd) => { + parquet::basic::Compression::ZSTD(parquet::basic::ZstdLevel::default()) } - Some(Compression::Lz4) => arrow2::io::parquet::write::CompressionOptions::Lz4, - Some(Compression::Zstd) => arrow2::io::parquet::write::CompressionOptions::Zstd(None), - None => arrow2::io::parquet::write::CompressionOptions::Snappy, + None => parquet::basic::Compression::SNAPPY, + } + } + + /// Get parquet version + pub fn parquet_version(&self) -> WriterVersion { + match 
self.parquet_version { + Some(ParquetVersion::V1) => WriterVersion::PARQUET_1_0, + Some(ParquetVersion::V2) => WriterVersion::PARQUET_2_0, + None => WriterVersion::PARQUET_2_0, } } @@ -151,6 +175,7 @@ impl Split { #[cfg(test)] mod tests { + use super::*; #[test] @@ -164,6 +189,7 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; assert_eq!( @@ -191,10 +217,12 @@ mod tests { format: "test_{}.parquet".to_string(), }), info_optional: false, + parquet_version: Some(ParquetVersion::V1), }; assert_eq!(params.batch_size(), 100); assert_eq!(params.read_buffer(), 8194); + assert_eq!(params.parquet_version(), WriterVersion::PARQUET_1_0); match params.subcommand.clone() { SubCommand::Split(s) => assert_eq!(s.format(), "test_{}.parquet"), @@ -213,12 +241,11 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; - assert_eq!( - params.compression(), - arrow2::io::parquet::write::CompressionOptions::Snappy - ); + assert_eq!(params.compression(), parquet::basic::Compression::SNAPPY); + assert_eq!(params.parquet_version(), WriterVersion::PARQUET_2_0); params = Command { input: std::path::Path::new("test/input.vcf").to_path_buf(), @@ -229,11 +256,12 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; assert_eq!( params.compression(), - arrow2::io::parquet::write::CompressionOptions::Uncompressed + parquet::basic::Compression::UNCOMPRESSED ); params = Command { @@ -245,12 +273,10 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; - assert_eq!( - params.compression(), - arrow2::io::parquet::write::CompressionOptions::Snappy - ); + assert_eq!(params.compression(), parquet::basic::Compression::SNAPPY); params = Command { input: 
std::path::Path::new("test/input.vcf").to_path_buf(), @@ -261,11 +287,12 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; assert_eq!( params.compression(), - arrow2::io::parquet::write::CompressionOptions::Gzip(None) + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::default()) ); params = Command { @@ -277,12 +304,10 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; - assert_eq!( - params.compression(), - arrow2::io::parquet::write::CompressionOptions::Lzo - ); + assert_eq!(params.compression(), parquet::basic::Compression::LZO); params = Command { input: std::path::Path::new("test/input.vcf").to_path_buf(), @@ -293,11 +318,12 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; assert_eq!( params.compression(), - arrow2::io::parquet::write::CompressionOptions::Brotli(None) + parquet::basic::Compression::BROTLI(parquet::basic::BrotliLevel::default()) ); params = Command { @@ -309,11 +335,9 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; - assert_eq!( - params.compression(), - arrow2::io::parquet::write::CompressionOptions::Lz4 - ); + assert_eq!(params.compression(), parquet::basic::Compression::LZ4); } } diff --git a/src/columndata.rs b/src/columndata.rs new file mode 100644 index 0000000..9004210 --- /dev/null +++ b/src/columndata.rs @@ -0,0 +1,220 @@ +//! 
Struct to link name and data + +/* std use */ + +/* crate use */ +use arrow::array::ArrayBuilder; + +/* project use */ + +/// Stores arrow array builders for each column datatype +#[derive(Debug)] +pub enum ColumnData { + /// Boolean column + Bool(arrow::array::BooleanBuilder), + /// Int32 column + Int(arrow::array::Int32Builder), + /// Float32 column + Float(arrow::array::Float32Builder), + /// String column + String(arrow::array::StringBuilder), + + /// List of int32 column + ListInt(arrow::array::ListBuilder), + /// List of float32 column + ListFloat(arrow::array::ListBuilder), + /// List of string column + ListString(arrow::array::ListBuilder), +} + +impl ColumnData { + /// Creates a new ColumnData based on arrow type, length and field name + pub fn new( + arrow_type: &arrow::datatypes::DataType, + length: usize, + field_name: &str, + nullable: bool, + ) -> Self { + match arrow_type { + arrow::datatypes::DataType::Boolean => { + ColumnData::Bool(arrow::array::BooleanBuilder::with_capacity(length)) + } + arrow::datatypes::DataType::Int32 => { + ColumnData::Int(arrow::array::Int32Builder::with_capacity(length)) + } + arrow::datatypes::DataType::Float32 => { + ColumnData::Float(arrow::array::Float32Builder::with_capacity(length)) + } + arrow::datatypes::DataType::Utf8 => ColumnData::String( + arrow::array::StringBuilder::with_capacity(length, length * 10), + ), + arrow::datatypes::DataType::List(field) => match field.data_type() { + arrow::datatypes::DataType::Int32 => ColumnData::ListInt( + arrow::array::ListBuilder::with_capacity( + arrow::array::Int32Builder::new(), + length, + ) + .with_field(arrow::datatypes::Field::new( + field_name, + arrow::datatypes::DataType::Int32, + nullable, + )), + ), + arrow::datatypes::DataType::Float32 => ColumnData::ListFloat( + arrow::array::ListBuilder::with_capacity( + arrow::array::Float32Builder::new(), + length, + ) + .with_field(arrow::datatypes::Field::new( + field_name, + arrow::datatypes::DataType::Float32, + nullable, 
+ )), + ), + arrow::datatypes::DataType::Utf8 => ColumnData::ListString( + arrow::array::ListBuilder::with_capacity( + arrow::array::StringBuilder::new(), + length, + ) + .with_field(arrow::datatypes::Field::new( + field_name, + arrow::datatypes::DataType::Utf8, + nullable, + )), + ), + _ => todo!(), + }, + dt => unreachable!("Unsupported arrow type, please check Schema: {:?}", dt), + } + } + + /// Add a Null value in array + pub fn push_null(&mut self) { + match self { + ColumnData::Bool(a) => a.append_null(), + ColumnData::Int(a) => a.append_null(), + ColumnData::Float(a) => a.append_null(), + ColumnData::String(a) => a.append_null(), + + ColumnData::ListInt(a) => a.append_null(), + ColumnData::ListFloat(a) => a.append_null(), + ColumnData::ListString(a) => a.append_null(), + } + } + + /// Get the length of internal array + pub fn len(&self) -> usize { + match self { + ColumnData::Bool(a) => a.len(), + ColumnData::Int(a) => a.len(), + ColumnData::Float(a) => a.len(), + ColumnData::String(a) => a.len(), + + ColumnData::ListInt(a) => a.len(), + ColumnData::ListFloat(a) => a.len(), + ColumnData::ListString(a) => a.len(), + } + } + + /// Check if array is empty (not used for now) + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Add a boolean value in array, if it's not a boolean array failled + pub fn push_bool(&mut self, value: bool) { + match self { + ColumnData::Bool(a) => a.append_value(value), + _ => todo!(), + } + } + + /// Add a i32 value in array, if it's not a integer array failled + pub fn push_i32(&mut self, value: Option) { + match self { + ColumnData::Int(a) => a.append_option(value), + _ => todo!(), + } + } + + /// Add a f32 value in array, if it's not a float array failled + pub fn push_f32(&mut self, value: Option) { + match self { + ColumnData::Float(a) => a.append_option(value), + _ => todo!(), + } + } + + /// Add a string value in array, if it's not a string array failled + pub fn push_string(&mut self, value: String) { + match self 
{ + ColumnData::String(a) => a.append_option(Some(value)), + _ => todo!(), + } + } + + /// Add a vector of integer value in array, if it's not a vector of integer array failled + pub fn push_veci32(&mut self, value: Vec>) -> arrow::error::Result<()> { + match self { + ColumnData::ListInt(a) => { + a.values().append_values( + &value + .iter() + .map(|v| v.unwrap_or_default()) + .collect::>(), + &value.iter().map(|v| v.is_some()).collect::>(), + ); + a.append(true); + Ok(()) + } + _ => todo!(), + } + } + + /// Add a vector of float value in array, if it's not a vector of float array failled + pub fn push_vecf32(&mut self, value: Vec>) -> arrow::error::Result<()> { + match self { + ColumnData::ListFloat(a) => { + a.values().append_values( + &value + .iter() + .map(|v| v.unwrap_or_default()) + .collect::>(), + &value.iter().map(|v| v.is_some()).collect::>(), + ); + a.append(true); + Ok(()) + } + _ => todo!(), + } + } + + /// Add a vector of string value in array, if it's not a vector of string array failled + pub fn push_vecstring(&mut self, value: Vec>) -> arrow::error::Result<()> { + match self { + ColumnData::ListString(a) => { + for v in value { + a.values().append_option(v); + } + a.append(true); + Ok(()) + } + _ => todo!(), + } + } + + /// Convert ColumnData in Arrow2 array + pub fn into_arc(self) -> std::sync::Arc { + let length = self.len(); + + match self { + ColumnData::Bool(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::Int(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::Float(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::String(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::ListInt(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::ListFloat(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::ListString(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + } + } +} diff --git 
a/src/error.rs b/src/error.rs index dde4087..face054 100644 --- a/src/error.rs +++ b/src/error.rs @@ -6,6 +6,7 @@ /* project use */ +/// Error type #[non_exhaustive] #[derive(thiserror::Error, std::fmt::Debug)] pub enum Error { @@ -15,11 +16,11 @@ pub enum Error { /// Arrow error #[error(transparent)] - Arrow(#[from] arrow2::error::Error), + Arrow(#[from] arrow::error::ArrowError), /// Parquet error #[error(transparent)] - Parquet(#[from] arrow2::io::parquet::read::ParquetError), + Parquet(#[from] parquet::errors::ParquetError), /// Io error #[error(transparent)] @@ -34,4 +35,5 @@ pub enum Error { Niffler(#[from] niffler::Error), } +/// Result type pub type Result = std::result::Result; diff --git a/src/lib.rs b/src/lib.rs index 98e38c9..d429857 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,14 +1,15 @@ //! vcf2parquet library -#[warn(missing_docs)] +#![warn(missing_docs)] /* std use */ /* crate use */ - +use parquet::file::properties::WriterVersion; /* project use */ /* mod section */ pub mod cli; +pub mod columndata; pub mod error; pub mod name2data; pub mod record2chunk; @@ -19,12 +20,13 @@ pub fn vcf2parquet( input: &mut R, output: &mut W, batch_size: usize, - compression: arrow2::io::parquet::write::CompressionOptions, + compression: parquet::basic::Compression, info_optional: bool, + parquet_version: WriterVersion, ) -> error::Result<()> where R: std::io::BufRead, - W: std::io::Write, + W: std::io::Write + std::marker::Send, { // VCF section let mut reader = noodles::vcf::Reader::new(input); @@ -33,36 +35,33 @@ where // Parquet section let schema = schema::from_header(&vcf_header, info_optional)?; + let schema_ptr = std::sync::Arc::new(schema); let mut iterator = reader.records(&vcf_header); let chunk_iterator = record2chunk::Record2Chunk::new( &mut iterator, batch_size, vcf_header.clone(), - schema.clone(), + schema_ptr.clone(), ); - let options = arrow2::io::parquet::write::WriteOptions { - write_statistics: true, - compression, - version: 
arrow2::io::parquet::write::Version::V2, - data_pagesize_limit: Some(batch_size), - }; - - let encodings = chunk_iterator.encodings(); - let row_groups = arrow2::io::parquet::write::RowGroupIterator::try_new( - chunk_iterator, - &schema, - options, - encodings, - )?; - - let mut writer = arrow2::io::parquet::write::FileWriter::try_new(output, schema, options)?; - - for group in row_groups { - writer.write(group?)?; + let options = parquet::file::properties::WriterProperties::builder() + .set_compression(compression) + .set_statistics_enabled(parquet::file::properties::EnabledStatistics::Page) + .set_writer_version(parquet_version) + .set_write_batch_size(batch_size) + .build(); + + let row_groups = arrow::array::RecordBatchIterator::new(chunk_iterator, schema_ptr.clone()); + + let mut writer = + parquet::arrow::ArrowWriter::try_new(output, schema_ptr.clone(), Some(options))?; + + for result in row_groups { + let group = result?; + writer.write(&group)?; } - let _ = writer.end(None)?; + let _ = writer.close()?; Ok(()) } @@ -72,8 +71,9 @@ pub fn vcf2multiparquet( input: &mut R, template: &str, batch_size: usize, - compression: arrow2::io::parquet::write::CompressionOptions, + compression: parquet::basic::Compression, info_optional: bool, + parquet_version: WriterVersion, ) -> error::Result<()> where R: std::io::BufRead, @@ -85,37 +85,36 @@ where // Parquet section let schema = schema::from_header(&vcf_header, info_optional)?; + let schema_ptr = std::sync::Arc::new(schema); let mut iterator = reader.records(&vcf_header); let chunk_iterator = record2chunk::Record2Chunk::new( &mut iterator, batch_size, vcf_header.clone(), - schema.clone(), + schema_ptr.clone(), ); - let options = arrow2::io::parquet::write::WriteOptions { - write_statistics: true, - compression, - version: arrow2::io::parquet::write::Version::V2, - data_pagesize_limit: Some(batch_size), - }; - - let encodings = chunk_iterator.encodings(); - let row_groups = 
arrow2::io::parquet::write::RowGroupIterator::try_new( - chunk_iterator, - &schema, - options, - encodings, - )?; - - for (index, group) in row_groups.enumerate() { - let output = std::fs::File::create(template.replace("{}", &index.to_string()))?; - let mut writer = - arrow2::io::parquet::write::FileWriter::try_new(output, schema.clone(), options)?; + let options = parquet::file::properties::WriterProperties::builder() + .set_compression(compression) + .set_statistics_enabled(parquet::file::properties::EnabledStatistics::Page) + .set_writer_version(parquet_version) + .set_write_batch_size(batch_size) + .build(); + + let row_groups = arrow::array::RecordBatchIterator::new(chunk_iterator, schema_ptr.clone()); - writer.write(group?)?; - writer.end(None)?; + for (index, result) in row_groups.enumerate() { + let group = result?; + let output = std::fs::File::create(template.replace("{}", &index.to_string()))?; + let mut writer = parquet::arrow::ArrowWriter::try_new( + output, + schema_ptr.clone(), + Some(options.clone()), + )?; + + writer.write(&group)?; + writer.close()?; } Ok(()) @@ -131,144 +130,125 @@ mod tests { "; static PARQUET_FILE: &[u8] = &[ - 80, 65, 82, 49, 21, 6, 21, 10, 21, 50, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, - 28, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, - 96, 48, 4, 0, 151, 222, 156, 170, 5, 0, 0, 0, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, - 114, 111, 109, 111, 115, 111, 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, - 40, 1, 49, 24, 1, 49, 0, 0, 21, 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, - 21, 0, 17, 28, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 0, 31, 139, 8, 0, 0, - 0, 0, 0, 0, 255, 99, 80, 228, 99, 0, 0, 69, 222, 72, 134, 4, 0, 0, 0, 21, 2, 25, 37, 0, 6, - 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, - 1, 60, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 21, 6, 21, 30, 21, 70, 92, - 21, 2, 
21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, - 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 0, 3, 0, 3, 1, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, - 99, 103, 96, 96, 48, 52, 48, 180, 52, 182, 52, 7, 0, 69, 88, 164, 201, 11, 0, 0, 0, 21, 12, - 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 4, 108, 105, - 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, 160, 3, 60, 54, 0, 40, 7, 49, - 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 21, 6, 21, 10, 21, 50, 92, - 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 0, 31, - 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 7, 0, 158, 10, 250, 19, 5, 0, 0, 0, - 21, 12, 25, 37, 0, 6, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, - 22, 74, 22, 114, 38, 186, 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 21, 6, 21, 10, 21, 50, - 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 0, - 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 4, 0, 171, 175, 153, 250, 5, 0, 0, - 0, 21, 12, 25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 4, 22, - 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 21, 6, 21, 4, 21, - 44, 92, 21, 2, 21, 2, 21, 2, 21, 0, 21, 4, 21, 0, 17, 28, 54, 2, 0, 0, 0, 3, 0, 31, 139, 8, - 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, - 117, 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 21, - 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 0, 0, 0, 3, - 0, 3, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 12, 25, 37, - 0, 6, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, - 116, 101, 114, 21, 4, 22, 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 21, 12, 25, 5, - 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 
101, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 21, 2, 25, 5, 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, - 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, - 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, - 25, 5, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 0, 22, - 0, 22, 0, 22, 0, 38, 0, 0, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, - 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, - 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, - 0, 25, 17, 2, 25, 24, 1, 49, 25, 24, 1, 49, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 4, 0, - 33, 14, 0, 25, 24, 4, 0, 33, 14, 0, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 7, 49, 48, 49, - 57, 51, 57, 55, 25, 24, 7, 49, 48, 49, 57, 51, 57, 55, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, - 24, 1, 71, 25, 24, 1, 71, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 65, 25, 24, 1, 65, 21, - 0, 25, 22, 0, 0, 25, 17, 1, 25, 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 2, 0, 25, 17, 1, 25, - 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 0, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, - 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, - 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, - 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 28, 22, 8, 21, 114, 22, 0, 0, 0, 25, 28, 22, - 202, 1, 21, 124, 22, 0, 0, 0, 25, 28, 22, 160, 3, 21, 158, 1, 22, 0, 0, 0, 25, 28, 22, 186, - 5, 21, 114, 22, 0, 0, 0, 25, 28, 22, 252, 6, 21, 114, 22, 0, 0, 0, 25, 28, 22, 190, 8, 21, - 96, 22, 0, 0, 0, 25, 28, 22, 222, 9, 21, 100, 22, 0, 0, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, - 25, 12, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, 21, 4, 25, 204, 72, 4, 114, 111, 111, 116, 21, - 14, 0, 21, 12, 
37, 0, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 37, 0, 76, - 28, 0, 0, 0, 21, 2, 37, 0, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 0, 53, 0, 24, 10, - 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, - 4, 108, 105, 115, 116, 21, 2, 0, 21, 12, 37, 0, 24, 2, 105, 100, 37, 0, 76, 28, 0, 0, 0, - 21, 12, 37, 0, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 37, 0, 76, 28, 0, 0, 0, - 21, 12, 37, 0, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 37, 0, 76, 28, 0, 0, 0, - 21, 8, 37, 2, 24, 7, 113, 117, 97, 108, 105, 116, 121, 0, 53, 0, 24, 6, 102, 105, 108, 116, - 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, 4, 108, 105, 115, 116, 21, 2, 0, 21, - 12, 37, 0, 24, 6, 102, 105, 108, 116, 101, 114, 37, 0, 76, 28, 0, 0, 0, 22, 2, 25, 44, 25, - 124, 38, 122, 28, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, - 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 22, - 222, 17, 21, 20, 22, 178, 14, 21, 34, 0, 38, 198, 2, 28, 21, 2, 25, 37, 0, 6, 25, 24, 8, - 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, 1, 60, 54, - 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 22, 242, 17, 21, 22, 22, 212, 14, 21, - 46, 0, 38, 190, 4, 28, 21, 12, 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, - 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, - 160, 3, 60, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, - 0, 22, 136, 18, 21, 24, 22, 130, 15, 21, 58, 0, 38, 172, 6, 28, 21, 12, 25, 37, 0, 6, 25, - 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 186, - 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 22, 160, 18, 21, 22, 22, 188, 15, 21, 34, 0, 38, - 238, 7, 28, 21, 12, 25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, - 4, 22, 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 
65, 0, 0, 22, 182, 18, - 21, 22, 22, 222, 15, 21, 34, 0, 38, 158, 9, 28, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, 117, - 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 22, 204, - 18, 21, 22, 22, 128, 16, 21, 34, 0, 38, 194, 10, 28, 21, 12, 25, 37, 0, 6, 25, 56, 6, 102, - 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 4, 22, - 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 22, 226, 18, 21, 22, 22, 162, 16, 21, 34, - 0, 22, 156, 4, 22, 2, 38, 8, 22, 180, 6, 20, 0, 0, 25, 124, 38, 0, 28, 21, 12, 25, 5, 25, - 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, - 0, 22, 248, 18, 21, 6, 22, 196, 16, 21, 22, 0, 38, 0, 28, 21, 2, 25, 5, 25, 24, 8, 112, - 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 254, 18, 21, - 6, 22, 218, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, - 105, 102, 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 22, 132, 19, 21, 6, 22, 240, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 24, 9, - 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 138, - 19, 21, 6, 22, 134, 17, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, - 114, 110, 97, 116, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 144, 19, 21, 6, 22, 156, - 17, 21, 22, 0, 38, 0, 28, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, 0, - 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 150, 19, 21, 6, 22, 178, 17, 21, 22, 0, 38, 0, 28, 21, - 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, - 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 156, 19, 21, 6, 22, 200, 17, - 21, 22, 0, 22, 0, 22, 0, 38, 0, 22, 0, 20, 2, 0, 25, 28, 24, 12, 65, 82, 82, 79, 87, 58, - 115, 99, 104, 101, 109, 97, 24, 244, 6, 47, 47, 47, 47, 47, 52, 56, 67, 65, 65, 65, 69, 65, - 65, 65, 65, 56, 118, 47, 47, 
47, 120, 81, 65, 65, 65, 65, 69, 65, 65, 69, 65, 65, 65, 65, - 75, 65, 65, 115, 65, 67, 65, 65, 75, 65, 65, 81, 65, 43, 80, 47, 47, 47, 119, 119, 65, 65, - 65, 65, 73, 65, 65, 103, 65, 65, 65, 65, 69, 65, 65, 99, 65, 65, 65, 65, 103, 65, 103, 65, - 65, 48, 65, 69, 65, 65, 70, 65, 66, 65, 65, 65, 77, 65, 81, 65, 65, 121, 65, 65, 65, 65, - 73, 81, 65, 65, 65, 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 50, 119, 65, 65, 65, 66, - 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, - 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 81, 65, 65, 65, 65, 81, - 65, 65, 65, 68, 115, 47, 47, 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, - 65, 65, 65, 66, 81, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, - 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, - 65, 71, 65, 65, 65, 65, 90, 109, 108, 115, 100, 71, 86, 121, 65, 65, 68, 56, 47, 47, 47, - 47, 66, 65, 65, 69, 65, 65, 89, 65, 65, 65, 66, 109, 97, 87, 120, 48, 90, 88, 73, 65, 65, - 79, 122, 47, 47, 47, 56, 119, 65, 65, 65, 65, 73, 65, 65, 65, 65, 66, 103, 65, 65, 65, 65, - 66, 65, 119, 65, 65, 69, 65, 65, 83, 65, 65, 81, 65, 69, 65, 65, 82, 65, 65, 103, 65, 65, - 65, 65, 77, 65, 65, 65, 65, 65, 65, 68, 54, 47, 47, 47, 47, 65, 81, 65, 71, 65, 65, 89, 65, - 66, 65, 65, 72, 65, 65, 65, 65, 99, 88, 86, 104, 98, 71, 108, 48, 101, 81, 68, 115, 47, 47, - 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, 65, 65, 66, 81, 65, 65, 65, - 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, 65, 65, 65, 65, 68, 65, 65, 65, - 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, 65, 74, 65, 65, 65, 65, 89, 87, - 120, 48, 90, 88, 74, 117, 89, 88, 82, 108, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, - 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, - 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, - 
65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 81, 65, 65, 65, 72, 74, 108, 90, - 109, 86, 121, 90, 87, 53, 106, 90, 81, 65, 65, 65, 79, 122, 47, 47, 47, 57, 111, 65, 65, - 65, 65, 88, 65, 65, 65, 65, 66, 103, 65, 65, 65, 65, 77, 65, 65, 65, 65, 69, 65, 65, 82, - 65, 65, 81, 65, 65, 65, 65, 81, 65, 65, 103, 65, 65, 65, 65, 77, 65, 65, 69, 65, 65, 65, - 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, 65, 65, 65, 65, 103, 65, 65, 65, 65, - 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, - 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, - 65, 81, 65, 65, 103, 65, 65, 65, 71, 108, 107, 65, 65, 68, 56, 47, 47, 47, 47, 66, 65, 65, - 69, 65, 65, 111, 65, 65, 65, 66, 112, 90, 71, 86, 117, 100, 71, 108, 109, 97, 87, 86, 121, - 65, 65, 68, 115, 47, 47, 47, 47, 79, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, - 65, 65, 65, 103, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, - 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 57, 80, 47, 47, 47, 121, 65, 65, 65, 65, - 65, 66, 65, 65, 65, 65, 67, 65, 65, 74, 65, 65, 81, 65, 67, 65, 65, 73, 65, 65, 65, 65, 99, - 71, 57, 122, 97, 88, 82, 112, 98, 50, 52, 65, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, - 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, - 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, - 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 103, 65, 65, 65, 71, 78, 111, 99, - 109, 57, 116, 98, 51, 78, 118, 98, 87, 85, 65, 0, 24, 44, 65, 114, 114, 111, 119, 50, 32, - 45, 32, 78, 97, 116, 105, 118, 101, 32, 82, 117, 115, 116, 32, 105, 109, 112, 108, 101, - 109, 101, 110, 116, 97, 116, 105, 111, 110, 32, 111, 102, 32, 65, 114, 114, 111, 119, 0, - 107, 7, 0, 0, 80, 65, 82, 49, + 80, 65, 82, 49, 21, 4, 21, 10, 21, 50, 76, 21, 2, 21, 0, 18, 0, 0, 31, 139, 8, 0, 0, 0, 0, + 0, 0, 255, 99, 100, 96, 96, 48, 4, 
0, 151, 222, 156, 170, 5, 0, 0, 0, 21, 6, 21, 4, 21, 44, + 92, 21, 2, 21, 0, 21, 2, 21, 16, 21, 0, 21, 0, 17, 28, 88, 1, 49, 24, 1, 49, 17, 17, 0, 0, + 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 96, 2, 0, 211, 115, 215, 175, 2, 0, 0, 0, 21, 12, + 25, 53, 0, 6, 16, 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 4, 22, + 2, 22, 106, 22, 186, 1, 38, 78, 38, 0, 28, 88, 1, 49, 24, 1, 49, 17, 17, 0, 0, 21, 4, 21, + 8, 21, 48, 76, 21, 2, 21, 0, 18, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 80, 228, 99, + 0, 0, 69, 222, 72, 134, 4, 0, 0, 0, 21, 6, 21, 4, 21, 44, 92, 21, 2, 21, 0, 21, 2, 21, 16, + 21, 0, 21, 0, 17, 28, 88, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 17, 17, 0, 0, 0, 31, 139, + 8, 0, 0, 0, 0, 0, 0, 255, 99, 96, 2, 0, 211, 115, 215, 175, 2, 0, 0, 0, 21, 2, 25, 53, 0, + 6, 16, 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 116, 22, 196, + 1, 38, 76, 38, 0, 28, 24, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 56, 4, 0, 33, 14, 0, 24, 4, + 0, 33, 14, 0, 17, 17, 0, 0, 21, 4, 21, 22, 21, 62, 76, 21, 2, 21, 0, 18, 0, 0, 31, 139, 8, + 0, 0, 0, 0, 0, 0, 255, 99, 103, 96, 96, 48, 52, 48, 180, 52, 182, 52, 7, 0, 69, 88, 164, + 201, 11, 0, 0, 0, 21, 6, 21, 12, 21, 52, 92, 21, 2, 21, 0, 21, 2, 21, 16, 21, 4, 21, 4, 17, + 28, 88, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 17, 17, 0, 0, 0, + 2, 0, 2, 1, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 96, 2, 0, 211, 115, 215, 175, 2, 0, 0, + 0, 21, 12, 25, 53, 0, 6, 16, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, + 4, 108, 105, 115, 116, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 4, 22, 2, + 22, 150, 1, 22, 230, 1, 38, 90, 38, 0, 28, 88, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, + 48, 49, 57, 51, 57, 55, 17, 17, 0, 0, 21, 4, 21, 10, 21, 50, 76, 21, 2, 21, 0, 18, 0, 0, + 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 7, 0, 158, 10, 250, 19, 5, 0, 0, + 0, 21, 6, 21, 4, 21, 44, 92, 21, 2, 21, 0, 21, 2, 21, 16, 21, 0, 21, 0, 17, 28, 88, 1, 71, 
+ 24, 1, 71, 17, 17, 0, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 96, 2, 0, 211, 115, 215, + 175, 2, 0, 0, 0, 21, 12, 25, 53, 0, 6, 16, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, + 99, 101, 21, 4, 22, 2, 22, 106, 22, 186, 1, 38, 78, 38, 0, 28, 88, 1, 71, 24, 1, 71, 17, + 17, 0, 0, 21, 4, 21, 10, 21, 50, 76, 21, 2, 21, 0, 18, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, + 255, 99, 100, 96, 96, 112, 4, 0, 171, 175, 153, 250, 5, 0, 0, 0, 21, 6, 21, 4, 21, 44, 92, + 21, 2, 21, 0, 21, 2, 21, 16, 21, 0, 21, 0, 17, 28, 88, 1, 65, 24, 1, 65, 17, 17, 0, 0, 0, + 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 96, 2, 0, 211, 115, 215, 175, 2, 0, 0, 0, 21, 12, + 25, 53, 0, 6, 16, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 4, 22, 2, 22, + 106, 22, 186, 1, 38, 78, 38, 0, 28, 88, 1, 65, 24, 1, 65, 17, 17, 0, 0, 21, 4, 21, 0, 21, + 40, 76, 21, 0, 21, 0, 18, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 21, 6, 21, 6, 21, 46, 92, 21, 2, 21, 2, 21, 2, 21, 16, 21, 4, 21, 0, 17, 0, 0, 2, 0, + 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 0, 0, 141, 239, 2, 210, 1, 0, 0, 0, 21, 8, 25, 53, + 0, 6, 16, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 78, 22, 158, 1, + 38, 68, 38, 0, 28, 54, 2, 0, 0, 21, 4, 21, 0, 21, 40, 76, 21, 0, 21, 0, 18, 0, 0, 31, 139, + 8, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 6, 21, 10, 21, 50, 92, 21, 2, + 21, 2, 21, 2, 21, 16, 21, 4, 21, 4, 17, 0, 0, 2, 0, 2, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, + 255, 99, 0, 0, 141, 239, 2, 210, 1, 0, 0, 0, 21, 12, 25, 53, 0, 6, 16, 25, 56, 6, 102, 105, + 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 4, 22, 2, + 22, 82, 22, 162, 1, 38, 68, 38, 0, 28, 54, 2, 0, 0, 25, 17, 2, 25, 24, 1, 49, 25, 24, 1, + 49, 21, 2, 25, 22, 0, 0, 25, 17, 2, 25, 24, 4, 0, 33, 14, 0, 25, 24, 4, 0, 33, 14, 0, 21, + 2, 25, 22, 0, 0, 25, 17, 2, 25, 24, 7, 49, 48, 49, 57, 51, 57, 55, 25, 24, 7, 49, 48, 49, + 57, 51, 57, 55, 21, 2, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 71, 
25, 24, 1, 71, 21, 2, 25, + 22, 0, 0, 25, 17, 2, 25, 24, 1, 65, 25, 24, 1, 65, 21, 2, 25, 22, 0, 0, 25, 17, 1, 25, 24, + 1, 0, 25, 24, 1, 0, 21, 2, 25, 22, 2, 0, 25, 17, 1, 25, 24, 1, 0, 25, 24, 1, 0, 21, 2, 25, + 22, 2, 0, 25, 28, 22, 86, 21, 108, 22, 0, 0, 0, 25, 28, 22, 230, 2, 21, 120, 22, 0, 0, 0, + 25, 28, 22, 176, 5, 21, 140, 1, 22, 0, 0, 0, 25, 28, 22, 156, 8, 21, 108, 22, 0, 0, 0, 25, + 28, 22, 172, 10, 21, 108, 22, 0, 0, 0, 25, 28, 22, 178, 12, 21, 90, 22, 0, 0, 0, 25, 28, + 22, 150, 14, 21, 94, 22, 0, 0, 0, 21, 4, 25, 204, 72, 12, 97, 114, 114, 111, 119, 95, 115, + 99, 104, 101, 109, 97, 21, 14, 0, 21, 12, 37, 0, 24, 10, 99, 104, 114, 111, 109, 111, 115, + 111, 109, 101, 37, 0, 76, 28, 0, 0, 0, 21, 2, 37, 0, 24, 8, 112, 111, 115, 105, 116, 105, + 111, 110, 0, 53, 0, 24, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 2, 21, 6, + 76, 60, 0, 0, 0, 53, 4, 24, 4, 108, 105, 115, 116, 21, 2, 0, 21, 12, 37, 0, 24, 10, 105, + 100, 101, 110, 116, 105, 102, 105, 101, 114, 37, 0, 76, 28, 0, 0, 0, 21, 12, 37, 0, 24, 9, + 114, 101, 102, 101, 114, 101, 110, 99, 101, 37, 0, 76, 28, 0, 0, 0, 21, 12, 37, 0, 24, 9, + 97, 108, 116, 101, 114, 110, 97, 116, 101, 37, 0, 76, 28, 0, 0, 0, 21, 8, 37, 2, 24, 7, + 113, 117, 97, 108, 105, 116, 121, 0, 53, 0, 24, 6, 102, 105, 108, 116, 101, 114, 21, 2, 21, + 6, 76, 60, 0, 0, 0, 53, 4, 24, 4, 108, 105, 115, 116, 21, 2, 0, 21, 12, 37, 0, 24, 6, 102, + 105, 108, 116, 101, 114, 37, 0, 76, 28, 0, 0, 0, 22, 2, 25, 28, 25, 124, 38, 194, 1, 28, + 21, 12, 25, 53, 0, 6, 16, 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, + 4, 22, 2, 22, 106, 22, 186, 1, 38, 86, 38, 8, 28, 88, 1, 49, 24, 1, 49, 17, 17, 0, 0, 22, + 226, 17, 21, 20, 22, 208, 15, 21, 34, 0, 38, 222, 3, 28, 21, 2, 25, 53, 0, 6, 16, 25, 24, + 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 116, 22, 196, 1, 38, 230, 2, + 38, 154, 2, 28, 24, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 56, 4, 0, 33, 14, 0, 24, 4, 0, + 33, 14, 0, 17, 17, 0, 0, 22, 246, 
17, 21, 22, 22, 242, 15, 21, 46, 0, 38, 188, 6, 28, 21, + 12, 25, 53, 0, 6, 16, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 4, 108, + 105, 115, 116, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 4, 22, 2, 22, 150, + 1, 22, 230, 1, 38, 176, 5, 38, 214, 4, 28, 88, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, + 48, 49, 57, 51, 57, 55, 17, 17, 0, 0, 22, 140, 18, 21, 24, 22, 160, 16, 21, 58, 0, 38, 136, + 9, 28, 21, 12, 25, 53, 0, 6, 16, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, + 4, 22, 2, 22, 106, 22, 186, 1, 38, 156, 8, 38, 206, 7, 28, 88, 1, 71, 24, 1, 71, 17, 17, 0, + 0, 22, 164, 18, 21, 22, 22, 218, 16, 21, 34, 0, 38, 152, 11, 28, 21, 12, 25, 53, 0, 6, 16, + 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 4, 22, 2, 22, 106, 22, 186, 1, + 38, 172, 10, 38, 222, 9, 28, 88, 1, 65, 24, 1, 65, 17, 17, 0, 0, 22, 186, 18, 21, 22, 22, + 252, 16, 21, 34, 0, 38, 140, 13, 28, 21, 8, 25, 53, 0, 6, 16, 25, 24, 7, 113, 117, 97, 108, + 105, 116, 121, 21, 4, 22, 2, 22, 78, 22, 158, 1, 38, 178, 12, 38, 238, 11, 28, 54, 2, 0, 0, + 22, 208, 18, 21, 22, 22, 158, 17, 21, 34, 0, 38, 244, 14, 28, 21, 12, 25, 53, 0, 6, 16, 25, + 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, + 114, 21, 4, 22, 2, 22, 82, 22, 162, 1, 38, 150, 14, 38, 210, 13, 28, 54, 2, 0, 0, 22, 230, + 18, 21, 22, 22, 192, 17, 21, 34, 0, 22, 232, 5, 22, 2, 38, 8, 22, 152, 10, 20, 0, 0, 25, + 28, 24, 12, 65, 82, 82, 79, 87, 58, 115, 99, 104, 101, 109, 97, 24, 208, 5, 47, 47, 47, 47, + 47, 120, 81, 67, 65, 65, 65, 81, 65, 65, 65, 65, 65, 65, 65, 75, 65, 65, 119, 65, 67, 103, + 65, 74, 65, 65, 81, 65, 67, 103, 65, 65, 65, 66, 65, 65, 65, 65, 65, 65, 65, 81, 81, 65, + 67, 65, 65, 73, 65, 65, 65, 65, 66, 65, 65, 73, 65, 65, 65, 65, 66, 65, 65, 65, 65, 65, 99, + 65, 65, 65, 67, 48, 65, 81, 65, 65, 90, 65, 69, 65, 65, 65, 81, 66, 65, 65, 68, 85, 65, 65, + 65, 65, 112, 65, 65, 65, 65, 71, 119, 65, 65, 65, 65, 69, 65, 65, 65, 65, 101, 80, 
55, 47, + 47, 120, 103, 65, 65, 65, 65, 77, 65, 65, 65, 65, 65, 65, 65, 65, 68, 68, 103, 65, 65, 65, + 65, 66, 65, 65, 65, 65, 67, 65, 65, 65, 65, 71, 122, 43, 47, 47, 43, 89, 47, 118, 47, 47, + 70, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 65, 65, 65, 65, 70, 68, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 67, 73, 47, 118, 47, 47, 66, 103, 65, 65, 65, 71, 90, 112, 98, 72, 82, 108, + 99, 103, 65, 65, 66, 103, 65, 65, 65, 71, 90, 112, 98, 72, 82, 108, 99, 103, 65, 65, 69, + 65, 65, 87, 65, 66, 65, 65, 68, 103, 65, 80, 65, 65, 81, 65, 65, 65, 65, 73, 65, 66, 65, + 65, 65, 65, 65, 89, 65, 65, 65, 65, 72, 65, 65, 65, 65, 65, 65, 65, 65, 81, 77, 89, 65, 65, + 65, 65, 65, 65, 65, 71, 65, 65, 103, 65, 66, 103, 65, 71, 65, 65, 65, 65, 65, 65, 65, 66, + 65, 65, 65, 65, 65, 65, 65, 72, 65, 65, 65, 65, 99, 88, 86, 104, 98, 71, 108, 48, 101, 81, + 65, 81, 47, 47, 47, 47, 70, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 65, 65, 65, 65, 70, + 68, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 47, 47, 47, 47, 67, 81, 65, 65, 65, 71, 70, + 115, 100, 71, 86, 121, 98, 109, 70, 48, 90, 81, 65, 65, 65, 68, 122, 47, 47, 47, 56, 85, + 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 65, 65, 85, 77, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 67, 122, 47, 47, 47, 56, 74, 65, 65, 65, 65, 99, 109, 86, 109, 90, 88, 74, 108, 98, + 109, 78, 108, 65, 65, 65, 65, 97, 80, 47, 47, 47, 120, 103, 65, 65, 65, 65, 77, 65, 65, 65, + 65, 65, 65, 65, 65, 68, 68, 119, 65, 65, 65, 65, 66, 65, 65, 65, 65, 67, 65, 65, 65, 65, + 70, 122, 47, 47, 47, 43, 73, 47, 47, 47, 47, 70, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, + 65, 65, 65, 65, 70, 68, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 52, 47, 47, 47, 47, 67, + 103, 65, 65, 65, 71, 108, 107, 90, 87, 53, 48, 97, 87, 90, 112, 90, 88, 73, 65, 65, 65, + 111, 65, 65, 65, 66, 112, 90, 71, 86, 117, 100, 71, 108, 109, 97, 87, 86, 121, 65, 65, 68, + 69, 47, 47, 47, 47, 71, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 65, 65, 65, 65, 67, 72, 65, + 65, 65, 65, 65, 103, 65, 68, 65, 65, 69, 65, 65, 
115, 65, 67, 65, 65, 65, 65, 67, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 66, 65, 65, 65, 65, 65, 65, 103, 65, 65, 65, 66, 119, 98, 51, + 78, 112, 100, 71, 108, 118, 98, 103, 65, 65, 65, 65, 65, 81, 65, 66, 81, 65, 69, 65, 65, + 65, 65, 65, 56, 65, 66, 65, 65, 65, 65, 65, 103, 65, 69, 65, 65, 65, 65, 66, 103, 65, 65, + 65, 65, 77, 65, 65, 65, 65, 65, 65, 65, 65, 66, 82, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 66, 65, 65, 69, 65, 65, 81, 65, 65, 65, 65, 75, 65, 65, 65, 65, 89, 50, 104, 121, 98, 50, + 49, 118, 99, 50, 57, 116, 90, 81, 65, 65, 0, 24, 25, 112, 97, 114, 113, 117, 101, 116, 45, + 114, 115, 32, 118, 101, 114, 115, 105, 111, 110, 32, 53, 49, 46, 48, 46, 48, 25, 124, 28, + 0, 0, 28, 0, 0, 28, 0, 0, 28, 0, 0, 28, 0, 0, 28, 0, 0, 28, 0, 0, 0, 214, 5, 0, 0, 80, 65, + 82, 49, ]; #[test] @@ -280,8 +260,9 @@ mod tests { &mut input, &mut output, 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::try_new(6).unwrap()), false, + WriterVersion::PARQUET_2_0, ) .unwrap(); assert_eq!(output, *PARQUET_FILE); @@ -297,8 +278,9 @@ mod tests { &mut input, &mut output, 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::try_new(6).unwrap()), false, + WriterVersion::PARQUET_2_0, ); assert!(result.is_err()); @@ -321,8 +303,9 @@ mod tests { &mut input, &format, 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::try_new(6).unwrap()), false, + WriterVersion::PARQUET_2_0, ) .unwrap(); } diff --git a/src/name2data.rs b/src/name2data.rs index 09bc2bc..d0d0241 100644 --- a/src/name2data.rs +++ b/src/name2data.rs @@ -2,16 +2,12 @@ /* std use */ -use arrow2::datatypes::Field; -use std::collections::HashMap; - /* crate use */ -use arrow2::array::MutableArray; -use arrow2::array::MutablePrimitiveArray; -use arrow2::array::TryPush; +use arrow::datatypes::Field; use 
noodles::vcf::record::genotypes::sample::value::genotype::allele::Phasing; /* project use */ +use crate::columndata::ColumnData; ///Alias of [std::collections::HashMap] that associate a column name and [ColumnData], a proxy of arrow2 datastructure #[derive(Debug)] @@ -20,13 +16,16 @@ pub struct Name2Data(rustc_hash::FxHashMap); impl Name2Data { /// Create a new Name2Data, vcf header is required to add info and genotype column /// length parameter is used to preallocate memory - pub fn new(length: usize, schema: &arrow2::datatypes::Schema) -> Self { + pub fn new(length: usize, schema: &arrow::datatypes::Schema) -> Self { let mut name2data = rustc_hash::FxHashMap::default(); for field in schema.fields.iter() { - name2data.insert( - field.name.clone(), - ColumnData::new(&field.data_type, length), - ); + let nullable = match field.data_type() { + arrow::datatypes::DataType::List(a) => a.is_nullable(), + _ => field.is_nullable(), + }; + + let column = ColumnData::new(field.data_type(), length, field.name(), nullable); + name2data.insert(field.name().to_string(), column); } Name2Data(name2data) } @@ -46,8 +45,8 @@ impl Name2Data { &mut self, record: noodles::vcf::Record, header: &noodles::vcf::Header, - schema: &HashMap, - ) -> std::result::Result<(), arrow2::error::Error> { + schema: &rustc_hash::FxHashMap, + ) -> std::result::Result<(), arrow::error::ArrowError> { let allele_count = record.alternate_bases().len() + 1; for (alt_id, allele) in record.alternate_bases().iter().enumerate() { for (key, column) in self.0.iter_mut() { @@ -80,10 +79,10 @@ impl Name2Data { &mut self, record: &noodles::vcf::Record, header: &noodles::vcf::Header, - schema: &HashMap, + schema: &rustc_hash::FxHashMap, alt_id: usize, allele_count: usize, - ) -> std::result::Result<(), arrow2::error::Error> { + ) -> std::result::Result<(), arrow::error::ArrowError> { let info = record.info(); for key in header.infos().keys() { @@ -93,7 +92,7 @@ impl Name2Data { match info.get(key) { Some(value) => 
match value { Some(noodles::vcf::record::info::field::Value::Flag) => { - column.push_bool(Some(true)); + column.push_bool(true); } Some(noodles::vcf::record::info::field::Value::Integer(value)) => { column.push_i32(Some(*value)); @@ -356,64 +355,29 @@ impl Name2Data { }, }, None => { - if let Some(field) = schema.get(&key_name) { - match field.data_type { - arrow2::datatypes::DataType::FixedSizeList( - ref field_type, - fixed_size, - ) => match &field_type.data_type() { - arrow2::datatypes::DataType::Int32 => { - column.push_veci32(vec![Some(0); fixed_size])? - } - - arrow2::datatypes::DataType::Float32 => { - column.push_vecf32(vec![Some(0.); fixed_size])? - } - - arrow2::datatypes::DataType::Utf8 => column - .push_vecstring(vec![ - Some("".to_string()); - fixed_size - ])?, - - _ => column.push_null(), - }, - _ => column.push_null(), - } - } else { - unreachable!("{} should be in schema", key_name); - } + unreachable!( + "Since the outermost option is Some, this should be unreachable" + ); } }, None => { if info_def.ty() == noodles::vcf::header::record::value::map::info::Type::Flag { - column.push_bool(Some(false)); + column.push_bool(false); } else { //Handle missing info field, only matters for FixedSizeList - if let Some(field) = schema.get(&key_name) { - match field.data_type { - arrow2::datatypes::DataType::FixedSizeList( - ref field_type, - fixed_size, - ) => match &field_type.data_type() { - arrow2::datatypes::DataType::Int32 => { - column.push_veci32(vec![Some(0); fixed_size])? - } - - arrow2::datatypes::DataType::Float32 => { - column.push_vecf32(vec![Some(0.); fixed_size])? 
- } - - arrow2::datatypes::DataType::Utf8 => column - .push_vecstring(vec![ - Some("".to_string()); - fixed_size - ])?, - - _ => column.push_null(), - }, + if schema.get(&key_name).is_some() { + match column { + ColumnData::ListFloat(_) => { + column.push_vecf32(vec![])?; + } + ColumnData::ListInt(_) => { + column.push_veci32(vec![])?; + } + ColumnData::ListString(_) => { + column.push_vecstring(vec![])?; + } _ => column.push_null(), //Otherwise, just push null } } else { @@ -431,41 +395,40 @@ impl Name2Data { &mut self, record: &noodles::vcf::Record, header: &noodles::vcf::Header, - schema: &HashMap, + schema: &rustc_hash::FxHashMap, alt_id: usize, allele_count: usize, - ) -> std::result::Result<(), arrow2::error::Error> { + ) -> std::result::Result<(), arrow::error::ArrowError> { for key in header.formats().keys() { for (idx, sample) in header.sample_names().iter().enumerate() { let key_name = format!("format_{}_{}", sample, key); let format_def = header.formats().get(key).unwrap(); if let Some(column) = self.0.get_mut(&key_name) { if let Some(format_field) = record.genotypes().get_index(idx) { - match format_field.get(key) { + match format_field.get(key).flatten() { Some(value) => match value { - Some( + noodles::vcf::record::genotypes::sample::Value::Integer( value, - ), - ) => { + ) + => { column.push_i32(Some(*value)); } - Some( + noodles::vcf::record::genotypes::sample::Value::Float( value, - ), + ) => { column.push_f32(Some(*value)); } - Some( noodles::vcf::record::genotypes::sample::Value::String( value, - ), ) => { if key.to_string()=="GT" { let mut gt_str = String::with_capacity(32);//Arbitrary capacity - if let Some(Ok(gt)) = format_field.genotype() + if let Some(gt) = format_field.genotype().and_then(|g|g.ok()) { + eprintln!("GT: {:?} ({:?},{:?})", gt,record.chromosome(),record.position()); gt.iter().enumerate().for_each(|(i,allele)| { let (position, phasing) = (allele.position(), allele.phasing()); match position { @@ -491,24 +454,20 @@ impl Name2Data 
{ }); } else { - eprintln!("Should be unreachable"); - gt_str.push_str("./."); + unreachable!("If GT is not present, the match arm won't take us there") } column.push_string(gt_str); } else { column.push_string(value.to_string()); } } - Some( noodles::vcf::record::genotypes::sample::Value::Character( - value, - ), + value ) => { column.push_string(value.to_string()); } - Some( - noodles::vcf::record::genotypes::sample::Value::Array(arr), - ) => match arr.clone() { + noodles::vcf::record::genotypes::sample::Value::Array(arr) + => match arr.clone() { noodles::vcf::record::genotypes::sample::value::Array::Integer( array_val, ) => match format_def.number() { @@ -751,78 +710,30 @@ impl Name2Data { }, }, - None => { - if let Some(field) = schema.get(&key_name) { - match field.data_type { - arrow2::datatypes::DataType::FixedSizeList( - ref field_type, - fixed_size, - ) => match &field_type.data_type() { - arrow2::datatypes::DataType::Int32 => { - column.push_veci32(vec![Some(0); fixed_size])? - } - arrow2::datatypes::DataType::Float32 => { - column.push_vecf32(vec![Some(0.); fixed_size])? - } - arrow2::datatypes::DataType::Utf8 => column - .push_vecstring(vec![Some("".to_string()); fixed_size])?, - _ => column.push_null(), - }, - _ => column.push_null(), - } - } else { - unreachable!("{} should be in schema", key_name); - } - }, }, - None => if let Some(field) = schema.get(&key_name) { - match field.data_type { - arrow2::datatypes::DataType::FixedSizeList( - ref field_type, - fixed_size, - ) => match &field_type.data_type() { - arrow2::datatypes::DataType::Int32 => { - column.push_veci32(vec![Some(0); fixed_size])? - } - arrow2::datatypes::DataType::Float32 => { - column.push_vecf32(vec![Some(0.); fixed_size])? 
- } - arrow2::datatypes::DataType::Utf8 => column - .push_vecstring(vec![Some("".to_string()); fixed_size])?, - _ => column.push_null(), - }, + None => if schema.get(&key_name).is_some() { + match column { + ColumnData::ListFloat(_) => { + column.push_vecf32(vec![])?; + } + ColumnData::ListInt(_) => { + column.push_veci32(vec![])?; + } + ColumnData::ListString(_) => { + column.push_vecstring(vec![])?; + } + _ if key.to_string() == "GT" => { + column.push_string("./.".to_string()); + } _ => column.push_null(), } } else { unreachable!("{} should be in schema", key_name); }, } - } else { - //Handle missing format field, only matters for FixedSizeList - if let Some(field) = schema.get(&key_name) { - match field.data_type { - arrow2::datatypes::DataType::FixedSizeList( - ref field_type, - fixed_size, - ) => match &field_type.data_type() { - arrow2::datatypes::DataType::Int32 => { - column.push_veci32(vec![Some(0); fixed_size])? - } - - arrow2::datatypes::DataType::Float32 => { - column.push_vecf32(vec![Some(0.); fixed_size])? 
- } - - arrow2::datatypes::DataType::Utf8 => column - .push_vecstring(vec![Some("".to_string()); fixed_size])?, - - _ => column.push_null(), - }, - _ => column.push_null(), - } - } else { - unreachable!("Malformed VCF, {} should be in schema", key_name); - } + } + else { + todo!("Understand how we could get there (the tests never did)"); } } } @@ -833,247 +744,97 @@ impl Name2Data { ///Convert Name2Data in vector of arrow2 array pub fn into_arc( mut self, - schema: &arrow2::datatypes::Schema, - ) -> Vec> { - let s: Vec> = schema + schema: &arrow::datatypes::Schema, + ) -> Vec> { + schema .fields .iter() - .map(|x| self.0.remove(&x.name).unwrap().into_arc()) - .collect(); - - s - } -} - -#[derive(Debug)] -pub enum ColumnData { - Bool(arrow2::array::MutableBooleanArray), - Int(arrow2::array::MutablePrimitiveArray), - Float(arrow2::array::MutablePrimitiveArray), - String(arrow2::array::MutableUtf8Array), - ListBool(arrow2::array::MutableListArray), - ListInt(arrow2::array::MutableListArray>), - ListFloat(arrow2::array::MutableListArray>), - ListString(arrow2::array::MutableListArray>), -} - -impl ColumnData { - pub fn new(arrow_type: &arrow2::datatypes::DataType, length: usize) -> Self { - match arrow_type { - arrow2::datatypes::DataType::Boolean => { - ColumnData::Bool(arrow2::array::MutableBooleanArray::with_capacity(length)) - } - arrow2::datatypes::DataType::Int32 => ColumnData::Int( - arrow2::array::MutablePrimitiveArray::::with_capacity(length), - ), - arrow2::datatypes::DataType::Float32 => ColumnData::Float( - arrow2::array::MutablePrimitiveArray::::with_capacity(length), - ), - arrow2::datatypes::DataType::Utf8 => ColumnData::String( - arrow2::array::MutableUtf8Array::::with_capacity(length), - ), - arrow2::datatypes::DataType::List(field) => match field.data_type() { - arrow2::datatypes::DataType::Boolean => { - ColumnData::ListBool(arrow2::array::MutableListArray::< - i32, - arrow2::array::MutableBooleanArray, - >::with_capacity(length)) - } - 
arrow2::datatypes::DataType::Int32 => { - ColumnData::ListInt(arrow2::array::MutableListArray::< - i32, - MutablePrimitiveArray, - >::with_capacity(length)) - } - arrow2::datatypes::DataType::Float32 => { - ColumnData::ListFloat(arrow2::array::MutableListArray::< - i32, - MutablePrimitiveArray, - >::with_capacity(length)) - } - arrow2::datatypes::DataType::Utf8 => { - ColumnData::ListString(arrow2::array::MutableListArray::< - i32, - arrow2::array::MutableUtf8Array, - >::with_capacity(length)) - } - _ => todo!(), - }, - arrow2::datatypes::DataType::FixedSizeList(field, _) => match field.data_type() { - arrow2::datatypes::DataType::Boolean => { - ColumnData::ListBool(arrow2::array::MutableListArray::< - i32, - arrow2::array::MutableBooleanArray, - >::with_capacity(length)) - } - arrow2::datatypes::DataType::Int32 => { - ColumnData::ListInt(arrow2::array::MutableListArray::< - i32, - MutablePrimitiveArray, - >::with_capacity(length)) - } - arrow2::datatypes::DataType::Float32 => { - ColumnData::ListFloat(arrow2::array::MutableListArray::< - i32, - MutablePrimitiveArray, - >::with_capacity(length)) - } - arrow2::datatypes::DataType::Utf8 => { - ColumnData::ListString(arrow2::array::MutableListArray::< - i32, - arrow2::array::MutableUtf8Array, - >::with_capacity(length)) - } - _ => todo!(), - }, - dt => unreachable!("Unsupported arrow type, please check Schema: {:?}", dt), - } - } - /// Add a Null value in array - pub fn push_null(&mut self) { - match self { - ColumnData::Bool(a) => a.push_null(), - ColumnData::Int(a) => a.push_null(), - ColumnData::Float(a) => a.push_null(), - ColumnData::String(a) => a.push_null(), - ColumnData::ListBool(a) => a.push_null(), - ColumnData::ListInt(a) => a.push_null(), - ColumnData::ListFloat(a) => a.push_null(), - ColumnData::ListString(_a) => { - if let Err(e) = self.push_vecstring(vec![None]) { - panic!("ListString {e:?}"); - } - } - } - } - - pub fn len(&self) -> usize { - match self { - ColumnData::Bool(a) => a.len(), - 
ColumnData::Int(a) => a.len(), - ColumnData::Float(a) => a.len(), - ColumnData::String(a) => a.len(), - ColumnData::ListBool(a) => a.len(), - ColumnData::ListInt(a) => a.len(), - ColumnData::ListFloat(a) => a.len(), - ColumnData::ListString(a) => a.len(), - } - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Add a boolean value in array, if it's not a boolean array failled - pub fn push_bool(&mut self, value: Option) { - match self { - ColumnData::Bool(a) => a.push(value), - _ => todo!(), - } - } - - /// Add a i32 value in array, if it's not a integer array failled - pub fn push_i32(&mut self, value: Option) { - match self { - ColumnData::Int(a) => a.push(value), - _ => todo!(), - } - } - - /// Add a f32 value in array, if it's not a float array failled - pub fn push_f32(&mut self, value: Option) { - match self { - ColumnData::Float(a) => a.push(value), - _ => todo!(), - } - } - - /// Add a string value in array, if it's not a string array failled - pub fn push_string(&mut self, value: String) { - match self { - ColumnData::String(a) => a.push(Some(value)), - _ => todo!(), - } - } - - /// Add a vector of bool value in array, if it's not a vector of bool array failled - pub fn push_vecbool(&mut self, value: Vec>) -> arrow2::error::Result<()> { - match self { - ColumnData::ListBool(a) => a.try_push(Some(value)), - _ => todo!(), - } - } - - /// Add a vector of integer value in array, if it's not a vector of integer array failled - pub fn push_veci32(&mut self, value: Vec>) -> arrow2::error::Result<()> { - match self { - ColumnData::ListInt(a) => a.try_push(Some(value)), - _ => todo!(), - } - } - - /// Add a vector of float value in array, if it's not a vector of float array failled - pub fn push_vecf32(&mut self, value: Vec>) -> arrow2::error::Result<()> { - match self { - ColumnData::ListFloat(a) => a.try_push(Some(value)), - _ => todo!(), - } - } - - /// Add a vector of string value in array, if it's not a vector of string array failled - pub fn 
push_vecstring(&mut self, value: Vec>) -> arrow2::error::Result<()> { - match self { - ColumnData::ListString(a) => a.try_push(Some(value)), - _ => todo!(), - } - } - - /// Convert ColumnData in Arrow2 array - pub fn into_arc(self) -> std::sync::Arc { - match self { - ColumnData::Bool(a) => a.into_arc(), - ColumnData::Int(a) => a.into_arc(), - ColumnData::Float(a) => a.into_arc(), - ColumnData::String(a) => a.into_arc(), - ColumnData::ListBool(a) => a.into_arc(), - ColumnData::ListInt(a) => a.into_arc(), - ColumnData::ListFloat(a) => a.into_arc(), - ColumnData::ListString(a) => a.into_arc(), - } + .map(|x| self.0.remove(x.name()).unwrap().into_arc()) + .collect() } } #[cfg(test)] mod tests { use crate::schema; + use arrow::array::ArrayBuilder; use super::*; static VCF_FILE: &[u8] = b"##fileformat=VCFv4.3 -##fileDate=20220528 -##source=ClinVar -##reference=GRCh38 -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##FILTER= +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= ##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##SAMPLE= -##SAMPLE= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT first second -chr1 100 . A T 50 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_RString=ref,alt;Info_G=1,2,3;Info_u=0,1,2,3,4 GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 1/1:44:1,2,3,5:testA:r,a:1,2,3:0,2,5,6,1 -chr1 200 . 
C G,CG 60 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42,43;Info_RChar=r,a,A;Info_RString=ref,alt1,alt2;Info_G=1,2,3,4,5,6;Info_u=1,6,3,4,5 GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:2,4,6,8:testA1,testA2:R,A,B:1,2,3,4,5,6:0,2,4 1/2:45:2,1,6,8:testB1,testB2:R,a,b:1,2,3,4,5,6:0,2,4,5,6 -chr2 300 . G A 70 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_RString=ref,alt;Info_G=1,2,3;Info_u=0,1,2,3,4;Flag GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_0 sample_1 +YAR028W 509242864 . a ATg 6 Filter_0 info_Integer_1=-1867486102;info_Integer_2=1180908493,1041698941;info_Integer_A=-207506013;info_Integer_R=-1221871784,-1356802777;info_Integer_G=-496257853,2127853583,-1498117417,-45419278,1783408501;info_Integer_.=2082620030,-344161839,-1022296779,-1007334133;info_Float_1=68.286865;info_Float_2=-96.154594,-23.433853;info_Float_A=-48.782158;info_Float_R=-46.15216,-92.639305;info_Float_G=-7.5115204,74.78337,1.5983124,-8.867523,77.741455;info_Float_.=26.825455;Flag_0info_Character_1=i;info_Character_2=r,[;info_Character_A=g;info_Character_R=M,D;info_Character_G=h,w,\\,v,o;info_Character_.=G;info_String_1=p]ZoXMTgQo;info_String_2=uVGn`JweVD,DUYytzAny[;info_String_A=_POshsqbSj;info_String_R=AdbZcRFrrQ,_[VS^RtSvz;info_String_G=MeTjonYVIn,jLIi`oWogn,tTH\\QXXOiA,LJLnuPtf`S,r^aaSswsvY;info_String_.=CzkT\\Wk_sG 
GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1876597949:706761235,-251943823:394859496:-1947058767,424473864:1331697703,-73747609,1645597043,-1553292366,-1685240226:300184417:18.381859:55.763123,-25.909782:-23.853012:-65.84661,-26.444412:12.577988,-87.76228,-3.4822464,-95.66553,55.56636:-35.16729,6.755356:H:Y,N:m:[,Q:B,C,g,L,`:c,x:xXYm`NnOG[:K`QKgogYxZ,uNAMyDqpgZ:liSmUzRvGG:XBgqxa[aBw,_ZxxkAFA[o:`OIdJgjZDS,tKauvtaIhw,mmrIgNXcbh,Rd]QWyFOgu,kSjBlBKigq:znOIm[gGXi,[j\\RlwOmAi 0/1:1178180247:-284426189,-998625419:1871179132:2059063854,-2098693212:1608185708,-1406134851,1030174330,-2031052594,1598302707:-419749875,-1478145995,-1699207585,-1247215944:-58.38821:-62.55126,8.762314:-74.02904:-24.794365,46.083145:13.760803,33.24704,-86.315704,60.576385,-14.547348:82.95245,46.642517,90.124435:a:r,Z:o:y,T:n,Q,F,E,n:m,n:aDWuppugIL:wOFhYRxBZH,MqOWyQIIAH:u\\QQqQyZ`t:TnZk`XSq\\I,_HmAWXBIAy:CL_`ebjENF,E`pNSPd^wz,^tZVmq_oBY,JgQ`oPn^Z\\,`bla^yzIWt:gmoGx]WbcW,VPniuT_IlS,skBLwLHlF_,fwwGspJRS\\ 0/1:246980022:-1016832924,1861844708:-8173468:-1069804542,-70068572:-1451768444,-1682870970,-1829205528,-2068943681,363393119:-288960163,1831626585,1958104113:-80.60921:43.23416,-74.28625:-79.06761:53.03195,8.447456:-14.780685,-46.596863,61.897903,29.243942,-69.91906:64.31647:I:P,N:e:v,h:O,T,N,v,r:w:SEWbLtHSUi:CnsIsSMCBy,^pRIQ\\eLD]:QRzYyzV_sz:wqgYJ`TzLK,hHWZiobiKn:dAPiptpPRU,QyBPeNqLaR,rPFJcjVaEr,HHloMTrcoG,yzgqiA_WIL:`ot_PZwl^\\,Uz^rcVndZg,_IpyMneGSa +23 1165400956 . T t 199 . 
info_Integer_1=-597222189;info_Integer_2=446843965,1432841503;info_Integer_A=-1756403175;info_Integer_R=-1210584642,1067164582;info_Integer_G=-2026752623,1524204480,2063402043,-1671581234,1992411203;info_Integer_.=387204105,-2048329790;info_Float_1=29.60765;info_Float_2=-70.24462,91.82048;info_Float_A=-57.780792;info_Float_R=-19.511703,87.46164;info_Float_G=17.362617,-10.059616,-89.640594,-70.55726,-48.635937;info_Float_.=82.884,-31.403328,-83.54941,-54.887726;Flag_0info_Character_1=];info_Character_2=Y,R;info_Character_A=p;info_Character_R=A,W;info_Character_G=i,y,I,q,i;info_Character_.=w;info_String_1=]gp_[s]vDh;info_String_2=Y\\SmynkIV^,tOuGkqHsiE;info_String_A=QDdbnppEhM;info_String_R=VgQkWCCgEH,r^aAgT^sOf;info_String_G=z\\_iwMGBRH,EQy^RJwkWd,gu]hpIwaVj,iwKORqBPP[,_ShIZ^Mr\\P;info_String_.=zxs[sGNNuy,cmnjXNUPka,QaFrhEZaIB,_TjXJMdWCM GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 
0/1:-871261733:-1500509753,2025272017:1864754769:-1127684339,-447878996:-1851298122,1367475939,1988967275,-439362500,-447904679:-907504720:-73.56331:-86.60319,-31.910011:6.785797:-95.413086,19.286415:9.942863,-23.623634,31.06224,42.57071,92.734314:-51.402973,-25.126984,73.030045:J:K,Y:K:c,^:y,P,V,b,S:U,w,M,X:IKZ]ZMDszw:apRf\\BVTcU,UOJHFcgkaj:fIjt]RZCsd:TtoRPBHoRS,sDF^wkt`MK:boQ]OQxmec,eJfBqcdaUg,To]BkSYKbI,J\\qQxtjZBq,\\nQWJTeYEf:tHujPdNde\\,EWFfR`mig_ 0/1:205411993:1422316187,136922489:-1998113238:-1581743308,-1016113531:234539080,48396474,1428303612,1012371357,-608258082:807181400:-48.021435:56.736588,-5.4926376:2.772995:6.886032,49.36194:72.34972,19.888977,96.9234,-52.7704,-20.327469:-86.68194,-95.58513,40.37178:r:[,s:r:i,g:p,S,t,_,r:I,N,Z,_:RioGPuuW[_:ZM`VMm[mpf,dTaWj\\c`Pm:bjEzGJcxUg:Iaec]IAHYe,yWTlfKaF]e:JYeMoCeFYn,lwAXvGHCdL,yzGwQfB_YF,lpaF[kfilC,xgYHiD]pz`:W[nVSvJsm_,hxR`xIkWis,dNZNX`_eku 0/1:249006189:-1550023525,1431034968:364065807:630293442,-1899991908:1343119655,1148049825,1254870322,-805282094,-913065710:1033469396,811475314,-784376229,1101867431:-84.87093:84.413025,-28.448128:-35.850975:98.89691,85.76083:-7.8424683,-39.988495,-94.08006,-6.4476013,48.44284:90.68361,-75.537186:P:`,g:j:f,t:M,t,l,u,G:D,^:xYp\\]fYNUh:vzefhZ_x]E,FqtKVH]Xvn:Z\\zIZeOkBf:hYfZzwqLzB,drm^rQSCM\\:wEaPIq`oJt,ZfNBcc[uV_,`pF`wSolGO,BI\\aL`htut,^OEbgPyBTf:fEql_^pktX +ENA|LT795502|LT795502.1 525786811 . A ATgA 226 . 
info_Integer_1=1004917273;info_Integer_2=-1087925856,-1111801609;info_Integer_A=1142924498;info_Integer_R=397636772,575245484;info_Integer_G=631457844,1508219739,2060178753,1508815851,-1692774727;info_Integer_.=915360277;info_Float_1=81.456085;info_Float_2=-97.36381,99.07503;info_Float_A=-17.968132;info_Float_R=23.030853,17.895386;info_Float_G=36.786133,-36.816742,-79.92742,13.375832,-41.70673;info_Float_.=-56.670807,-88.687706;info_Character_1=x;info_Character_2=f,X;info_Character_A=Q;info_Character_R=M,v;info_Character_G=`,B,I,K,K;info_Character_.=s,S,a,E;info_String_1=RULoNvUdVj;info_String_2=YlKPytYpDY,hwIe\\Lokil;info_String_A=\\VZSHlparH;info_String_R=PVoBxilKPl,_s`t`swTzf;info_String_G=FiINYvUJIO,LtzFxYFFJp,mMeZaQtSZU,rFHUKkG]FO,LxQTXnzEJr;info_String_.=hoXwhfvniY,A\\txGAVbNp,SbcLeVnkYI GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 
0/1:-1992801692:305092475,-1179215612:-1739296736:-388696167,-659498070:1503808064,-183677921,-849522112,-1806185994,-1784485194:-498574773:14.566895:10.018013,-16.879608:99.71886:0.29120636,-9.131622:80.328705,-26.403976,-37.213943,57.87976,61.69644:58.0925,-70.47134,-59.34813:X:N,q:S:a,_:c,W,U,b,Y:m:^QkYoWratI:ryX^JlAyCx,WGZntpNsOo:FRryqZFoMj:gCO^BOI[ml,VJqiy[VWym:nbtqw^\\zmA,ZiBBJm[Vbv,aNMll`xnfr,nIJf`wjzny,i[qz[mHs_N:rHzB`UssLW,apsPd_lrip,Uih`ROsUql,tnBQQdhtwm 0/1:-24901120:240741600,-335142169:1743578406:-898920674,473452936:266099587,818869222,-1461529615,1643094296,-2054606423:641472069,-1850726656,-386681464,266081312:97.276596:48.907135,-99.147675:-17.34481:36.995285,-15.711685:-74.65256,-87.14964,0.5836029,36.94156,-83.004906:90.52608,5.7993164:b:`,L:w:l,M:G,m,s,V,n:i,p:ZMBpgnUaaU:[fe_qYtEAS,HcYIbeHcgz:kanq^dgO[M:`v^vmOCeeJ,XRDAZIQsWC:czkmbtrEXT,JHm^^jZjSZ,MrQ[yeKALl,hoEeih^Nvf,MYQGSi^Zux:XI_a[M^G[_,czNvmEmcXT,zwNxaRelTy,\\UHxM[KJGU 0/1:60157897:319490021,-742515096:-1289964762:1628004982,235029603:2020442014,308460461,1558271982,1627368865,894042318:1994633465,-1579924515:-10.664841:-62.51185,53.523605:94.14816:2.9510727,44.83983:-5.713913,-74.449394,-56.378246,-46.97802,13.483833:5.34816,65.6853,-95.33172,-58.07209:U:F,W:\\:A,E:t,j,c,F,J:H,R,S,^:`ZVfoyxdDK:gntf_rQo]a,mHMNJLO[`K:]PjxDCRfYV:MMGtGvm[wr,eQeumfsRZL:EqInOsdeDW,xOQBKswphI,nU_PiY]xef,cAHdxvRbFC,kvJ[v^kdcb:fyKXA\\hjfv +ENA|LT795502|LT795502.1 1506498921 . 
A gT 99 Filter_0 info_Integer_1=1074860489;info_Integer_2=-6784655,1952022752;info_Integer_A=-1765522773;info_Integer_R=1316333577,-554518728;info_Integer_G=-440746192,417172829,1208578807,-1256320970,168283749;info_Integer_.=-67150747,-701563860,1708267257;info_Float_1=-85.47166;info_Float_2=33.09308,37.761444;info_Float_A=99.544266;info_Float_R=-4.276779,27.070168;info_Float_G=-93.02027,-68.755196,-18.597626,-82.3945,94.890884;info_Float_.=45.63858;info_Character_1=R;info_Character_2=B,W;info_Character_A=l;info_Character_R=z,E;info_Character_G=Z,h,F,D,D;info_Character_.=n,G,v;info_String_1=ojZkSfujYX;info_String_2=`ZrZJtq_hx,StcGnLjWNS;info_String_A=k[feQ[mqyE;info_String_R=Grr[rGo^md,GkiXanc\\K\\;info_String_G=Yhij\\pOPji,yYlsCnJSCY,VggsEuC]ad,G^jiYbvsbn,IJmJvG`jzs;info_String_.=CKtxVQFr]_,G^jAnQaGyI,yvzleXG`vO GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 
0/1:-2039921838:1260782784,-2144365597:1110295788:-158846729,1495837063:390766793,1114219927,-790406568,1652554877,-1144133980:-610126444,1736640977:0.7073364:-56.6103,13.725807:-8.711052:-35.14733,55.534668:-41.20214,-69.47873,15.234543,1.8139114,-81.88782:-86.727234:]:f,Q:I:u,_:b,k,C,[,T:b,S,O:hGS[GPZUZu:uQuJZtwYq_,SyDvF`v_[[:ol]TtBXxVP:mFMNUVM`Ir,XbBPeoBkYj:`xGS_`zgey,v\\]bxaFPdJ,lGqnBWyHQI,ynpDFGuSsm,^bGxsDdgIl:q`HG\\bwqSl,GrnuUSzgVy,cRSPjljk_Q,TWbW]MISyd 0/1:925068425:1074963297,820496013:2032912248:-456701844,1354651711:-1215180367,1123368027,-680845673,-332079579,604760814:476241956:-25.982353:-75.11304,97.80142:62.201385:39.84816,-3.5477142:-50.861835,-43.965935,-45.22519,44.636627,64.44443:-27.258255,-56.71892,81.974884:D:g,j:o:C,D:T,v,A,[,r:c,L,I,I:OJBBSJbN`T:DdPsffHJuV,AJInhMhoiR:__FbESuepO:SDF_mGG^JG,YFfWtuVWFc:ObAtvWdiHC,nRn[JyLGrn,AvnzUN`iJP,BD\\thZTCSk,SuJCDkzPGU:U]SgYvoNeJ 0/1:180052409:3916924,-608184065:406358148:1618596409,-143985416:781007994,-768878726,1593943437,-803117731,-914254344:446901758,117973804,20424962:-9.037872:-71.62204,-38.234306:-38.99839:-78.47328,-74.93701:57.11064,53.769928,22.832726,-1.6777267,96.80972:77.72031,85.200424:J:Z,Y:S:V,u:O,],w,n,Q:Z:Hesgu[drFG:pWRnQtXSiX,D`cgIgjITG:D[lL^EIPfl:AOXfKDsetT,GbyjVXojJF:AcGyoIohdU,zIRGXEkwRv,JeuIkD^`cs,MIpZxRusP_,DvXBProTn_:X[sDTJasXv,[nehoK`q^y +chrMT 900574305 . 
c a 208 Filter_0 info_Integer_1=-523627641;info_Integer_2=828853617,538841733;info_Integer_A=-1070289656;info_Integer_R=1177092376,1248528320;info_Integer_G=1338006213,-939491184,2031520519,1625981257,-1542813010;info_Integer_.=1020802949,-325766450,-1975174725;info_Float_1=86.21178;info_Float_2=55.274292,57.992126;info_Float_A=98.6465;info_Float_R=-56.676746,-78.452255;info_Float_G=-15.1058655,-32.05681,-23.85817,25.53675,-44.79227;info_Float_.=-8.955811,49.62912,72.4005;Flag_0info_Character_1=x;info_Character_2=B,r;info_Character_A=i;info_Character_R=\\,D;info_Character_G=g,v,l,W,T;info_Character_.=g;info_String_1=j`zU\\K`PLc;info_String_2=l_rhk[Kr]b,rFB_aSUBR`;info_String_A=ZQYIsGAof_;info_String_R=HwrUBliWel,scCvWxgE[r;info_String_G=kY`HMgmH_p,IUvLZ]sdba,nVQJ_Fh`ET,QnYUFz`ShS,KXwJfcYmsw;info_String_.=Q]wexXkHyr,fL\\NGDMlkW,jVpWnME[tp,Zz\\hrFyx`] GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 
0/1:-1648034443:845207498,1438334805:-821666363:-532302872,-784878946:-1660896800,2008926111,1279825538,-233248668,-1578146061:-1728381686,-1354962949,-2095339305:4.250908:1.8742523,-78.206894:-22.634148:-61.518906,13.15873:-64.15179,17.086502,-25.609276,57.059555,-92.1911:26.885536,-27.22528,-51.00875:F:l,e:M:y,e:^,X,g,x,h:K:LXIC\\KLZsD:Nd[vIMiHJA,^ConDldYtT:CBqgBJnzRq:nOgdeNbqKd,rTYmYwJcQu:WMlQs]gO[a,Kk[sJ[UoxT,HT]XWH^ZTF,IIJXSmrLHg,qo`OJ_hgav:tSEKSQWXRL 0/1:-1351553081:219969512,1955232553:917524488:1134530757,-362836542:-2127154133,-1470782646,-1443121280,488596267,-1560382672:-1061588830,843145750,978368685:67.93184:-0.71754456,86.030624:-60.117985:36.45627,-81.18977:53.986176,92.98938,-65.36742,7.6014786,46.42549:27.035995,-57.711315,99.20256:M:E,b:k:v,o:\\,y,j,N,J:n,J,_:EvnT^AAXoY:cdVJ^MhXbh,y[[LcBZavI:f^EAgXrRhj:A`FeAFUqKi,fsOnJ\\kmaj:JyK`vg^yGK,G`eXThODuq,gRMJ`naYA^,WkywfqqL`h,ItKK^GJvpU:brd_fl`zxc 0/1:-334893903:1812791387,-1702573904:91913024:-1715303171,1720214253:-1065363642,-1781482473,1593677428,-1611378854,-1463000308:-918589861,802242226,-512257664:29.155685:36.117004,53.34468:-73.653984:-47.224236,80.070786:26.598068,-14.040497,75.17987,-24.15435,-7.163788:-31.709908,-72.07062,53.230316,-11.984253:f:c,r:F:Y,B:m,Q,O,J,S:e,x,V:wsZzDGtnRs:rr^refYy\\D,xWTSt]bRdz:pBWEGavAaK:ROjxoClYNb,ppopUzLGgP:LCytHxxfrF,dGG^fvX^iR,uqkLhoqbuy,yFAbgESl[^,SlKPAuDoaN:hbu_MpWRtk,kJqJRFtD`K +X 508903144 . 
T taAG 107 Filter_1 info_Integer_1=-442012684;info_Integer_2=1242798393,-893635990;info_Integer_A=-1049853993;info_Integer_R=242988245,-245551581;info_Integer_G=992362638,-556141956,-1436766801,1237135939,-1164555077;info_Integer_.=-1890267838;info_Float_1=50.181763;info_Float_2=5.1533203,32.221054;info_Float_A=31.930801;info_Float_R=18.487122,-4.3887863;info_Float_G=-98.10066,-69.57614,91.27092,60.39116,13.878372;info_Float_.=-46.815468,-65.40532;info_Character_1=_;info_Character_2=C,c;info_Character_A=w;info_Character_R=v,g;info_Character_G=v,Q,r,`,W;info_Character_.=\\,j,W;info_String_1=Ol_Gd[f\\tt;info_String_2=DX_Nqhsbft,ZvYZmhftHw;info_String_A=bXt^\\wzwfQ;info_String_R=ANOP[Zjcef,[mLDzYe^Xa;info_String_G=i_XoxRH]Up,N\\qKskBEfm,vceQjVrtTu,_LnQ_[ngn],yd[ZFNmECq;info_String_.=FPpMF]TI[C,skPWfxtNBS,uEV]cCazM[ GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 
0/1:-1784107859:-939663,-275551415:1120891123:-769476243,636006815:-1200543946,215752478,-1484326861,-561668774,-176043456:1567908440,-504633662,-461948338:41.60312:89.26175,-28.585121:-25.16079:-93.47296,-97.84482:-64.203835,62.569122,-16.536377,24.606033,-83.769844:-0.48690033,50.683716,-61.86049:u:p,x:G:O,q:J,g,s,K,g:c,l,a,w:mgLADffOAW:OFYzsNT]DN,knpK_\\ZlLw:NIGi]zaPz\\:vcAptCt\\VT,z[VAZFjS[p:tcGuXLiEpv,fCLE[^Bzbh,rkGwaRnhxi,fTMvHtFRwN,RPYaXfd^BF:DOnEKUN[]V,ppBNNWrhhK 0/1:1910075420:-934542610,-878954512:1171320700:1208017031,-340691680:712157209,1065873060,-1366658844,942622778,-783364205:1637019880,-955981111,1346196180,1597225767:-26.175598:93.89627,-80.47638:-29.046753:-60.67419,85.3492:-41.516758,-69.87106,22.452782,21.140648,61.20903:-78.91676,26.818916,-59.74772,-27.64888:k:n,]:U:T,X:B,r,c,R,G:Z:yGqzsPBWTl:AyQbJXbTNE,wQIyxKzeXn:F\\_wSzKzz_:GPJEnUVNAi,OGt^nVFblY:Npi`aeQSMU,_lhlMXNtTB,[QXjBWuj]d,rKRDMyolVK,QciXXonSdu:ycSYShz\\Gk,\\hYbIs`WDD,CfueEstMtZ 0/1:323372379:1922884232,266253196:-2038955647:-206847340,703053779:740621164,1717070470,2056797316,-1709077983,174222777:-229529326,1150748796:12.667084:-49.801563,43.236115:-82.59476:73.92726,70.36862:16.87722,-79.93007,70.9064,95.77089,-38.30352:86.8264:O:D,k:V:j,u:i,I,z,_,J:u:HEuDirTQc]:[`cTQh_wUo,gJqeDnKyu`:TexBlzobtU:YTlxHhXaRn,DAIwrYQqxu:LK`xEnzWq[,zZmLwxn\\m`,rujYx[cES^,ELVaIdJFDY,Z`Fuq`aaii:LTi`XfaE^C,jgR\\cCazHR,v_dFUwq^p^,wrsXkkAV\\I +23 2057099842 . g a 105 . 
info_Integer_1=-1463169227;info_Integer_2=1864557327,-1832965500;info_Integer_A=-1397690738;info_Integer_R=230018794,906575350;info_Integer_G=1606595129,-440932389,310954072,-1735028992,-71170678;info_Integer_.=-161655144,935253047,9786785;info_Float_1=-99.35045;info_Float_2=-42.93885,-82.69522;info_Float_A=83.79291;info_Float_R=-85.69043,-11.00209;info_Float_G=-60.064484,-27.798103,94.52054,-38.2653,3.591034;info_Float_.=70.61771,-63.402893,30.473663;Flag_0info_Character_1=g;info_Character_2=V,g;info_Character_A=V;info_Character_R=h,j;info_Character_G=G,R,J,\\,F;info_Character_.=`;info_String_1=ukwTtXgA^`;info_String_2=o_mXjM^xZX,WgiO^rhwGP;info_String_A=QGOHr_eINM;info_String_R=JvS[necmAA,BnmyIEEpWY;info_String_G=EPGPQVsCly,YXuaHkTy\\r,l\\kp_Z_cmw,yk`oYPWJTC,jPmpS\\c\\[i;info_String_.=pqZPLvIWXJ,u_QaCDUzqA,dNRpmBctzH GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 
0/1:883085881:-1163194169,396734570:-864830302:-1015935718,416512239:1055659899,378650980,1719415308,-591094434,-602013467:1562372336,38255854:-96.96286:-22.160027,2.9108505:-76.4101:-31.626724,74.70122:-10.465622,92.86656,66.5076,-47.610092,73.46921:-35.273743,71.3008,-21.861176:N:Z,x:w:D,Q:g,[,i,i,f:b,u,q:rOrdrrnAAf:fkbpBSUynu,ks[PFA_asT:JK\\[phzgTx:sIMyglMmy[,zdTHmmDXr`:mJZqxRqayy,nzIi[BVIeR,TJX_tcRRuR,nwx\\LjKCPX,EMgDVxWAhQ:urxfXYFYWh,yxg_KbMfiV,O^NfHdYiTT,RQLQTrA`[D 0/1:712995789:-341947022,602595428:496080894:-317586736,-1599924675:1901874239,-1170280909,-1426570445,1489049109,-2000710204:448535337,-1814547565:99.9409:73.24715,-61.679005:42.36447:-67.20648,-37.822533:-44.426228,0.8399048,61.94937,67.76169,-47.838783:96.54474:I:l,g:W:f,T:C,j,],k,a:y,G,Q:nuKQNGtG`H:rk_WztmXlA,rCVaSXG`vY:XNJj\\lgAgb:qKSozwOYIa,K\\rGODc_`q:i_rCsVxkls,YErisA[XyI,nDyzaAV^te,NlxAGcHuIw,CYzN[Gykcx:bDyCEM_Ntk,hVlhY\\KH\\V,TyRCuSB`sn,XEuXmmuCrD 0/1:-1170229269:-1356256190,225279207:869161357:771625610,-569908878:293513696,-619213311,-1755999259,1604615807,1087899712:-745470529,2144376132,-1224677810:-28.347206:-60.524513,-69.2384:-86.569954:-58.52585,-74.32852:-59.802223,77.79262,69.73915,72.24533,-7.9125137:34.921783:N:h,i:u:i,l:L,g,L,B,g:S,c:^y[zODvjKw:NpMKSd_yN],vkeZUtFWRR:oqiWbtGZPN:ItYyF]PgmM,e\\qT_MdBPH:p[n_BBNiqU,my^WxdBmGo,RB__ZCtWd\\,bx_t`szKbQ,qQuZ`\\jq[C:LzEcfVrmKJ,\\QgmcLzgxq +NC_000015.10 278483743 . 
c Gc 68 Filter_0 info_Integer_1=-1771300013;info_Integer_2=611485162,1796725452;info_Integer_A=971438374;info_Integer_R=698255143,905472298;info_Integer_G=-200904731,1733482657,-1601571925,-95180709,852757134;info_Integer_.=1226331110,1800309665;info_Float_1=-8.41539;info_Float_2=-91.844246,33.56476;info_Float_A=-70.37154;info_Float_R=-88.31048,49.067856;info_Float_G=-56.826736,-72.017075,58.757156,-87.95636,27.40886;info_Float_.=-93.33689,-11.933228;info_Character_1=V;info_Character_2=F,H;info_Character_A=];info_Character_R=],k;info_Character_G=b,j,c,v,k;info_Character_.=W,J,`;info_String_1=[sg[NfQUjS;info_String_2=tuIDx]qY`n,sYihzjCcDX;info_String_A=rWeIJoLqif;info_String_R=Lc\\nOEn`SI,ohc]_`UFau;info_String_G=ts\\z[cGhVY,FWhsZospCl,EibJWC`AtQ,mNRPCdUKvw,lVzUvcofUf;info_String_.=ivsywIpK\\E,dw[sIibpcF,ngkSonUgLJ,OXEnvoSKPb GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 
0/1:1053018922:-184938975,-1760026633:-857964358:-2102716665,1815665606:-1175872690,-347280558,1231790968,-790356303,-222139945:-1345714777,160079922:-82.493515:64.684265,-99.60785:-19.693016:-3.5498886,-99.20609:-5.4513702,-81.74608,-6.946541,-87.49165,-31.873795:-0.010063171:x:n,^:R:b,h:h,c,],\\,X:I:LWSccGJLM_:BFr_AeHgBF,HkaUOy`jqz:FGMQKwoeFA:^WCLcfxttz,YAipFMM\\Sa:WfUeg^ehSy,Rn^uqvYSmS,sEKSkLrC\\p,QDmf[JGzbG,QiZIx`^pZN:Kmi[\\ChDrN,OJdBvVq[Af 0/1:1914159851:1114283704,-2014859484:1560783535:1865312897,256404857:863585279,-267380020,-134015412,856986131,1668297008:-1579607045,1788206035:91.887955:-27.877113,-57.696247:-67.211655:-94.020226,87.17949:96.09091,-27.167084,99.71431,-84.021736,20.75412:50.480682:r:e,q:w:t,E:R,V,i,I,k:o:jO[Cax\\_zE:beNr\\DPEAj,[rxvLKUNpg:Em^Y^JfmRt:I_pzdiZKvX,KNhZiCPeWc:^rxclcxiEF,bUblZrRMet,KXXitlqZ^r,tDrkWBjrjr,IybJjtMXvv:Qavvd`qH]Z,lXvCHRinb`,EYSpZQdvya 0/1:2112335424:1117304177,1186842567:1518339830:1538701932,1525880826:1156301305,293295010,829335070,-1308097481,567438411:-876069694:-71.12594:80.75633,-92.32674:-51.611877:-83.43153,2.3728333:-16.384102,18.902657,34.962082,26.661896,29.297455:6.421852,-9.252121:u:R,F:\\:],v:A,B,y,p,c:v:miPNAjQGGF:`cKYG[lHBt,wbp^hin^Rk:^kE[kkegww:\\gzDFBIToj,SR]K\\pHId`:KuB]TPhgby,XbLVwenUxV,qxSSu^ko^v,TcVjGbYkIp,_Y`cUrzUBV:dc[XIJDWB_,R_YE]uzONR +NC_016845.1 1273217582 . 
A cA 185 Filter_1 info_Integer_1=1702504238;info_Integer_2=-1300020074,-1771363986;info_Integer_A=-666582393;info_Integer_R=-1483769984,-1241578554;info_Integer_G=1976807172,-1260807615,-108510257,1277543943,1016305186;info_Integer_.=1829528682,-928482172,-429726805,-2007283327;info_Float_1=70.20416;info_Float_2=94.22778,49.014664;info_Float_A=77.67261;info_Float_R=69.01376,-85.50122;info_Float_G=22.049858,-31.612656,67.47859,42.012314,-0.50154114;info_Float_.=-79.06835,36.144714,-11.66687,-33.392593;Flag_0info_Character_1=Y;info_Character_2=s,w;info_Character_A=Y;info_Character_R=E,S;info_Character_G=J,C,C,^,N;info_Character_.=s;info_String_1=RnqfrhRxGK;info_String_2=QCczZsqSMX,UadliszTvD;info_String_A=UeAmTxgIJs;info_String_R=RpC[yfli[m,UMXScERIAT;info_String_G=Piyg[YSyn],wrVNLOsrsd,CuaYlZzSG`,hHfxMnZBYb,fELmQbwhQV;info_String_.=MHfN_MSgEe,_REWJxavTD,SkkKdTmDLG,R[oCTWMP\\K GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 
0/1:-1879771452:-2024174827,895533931:25454566:-1413372025,-1730678484:1651476894,522323445,-801323168,-692607812,1081910993:-1359195475:64.78084:98.23514,-95.097374:97.43535:-65.953636,7.431885:74.098724,-84.2887,-56.68762,-86.44216,96.5443:32.076004:K:G,\\:G:W,z:h,o,j,X,Y:e,G,x:nbjlHj^`q]:aa^uI^^SoQ,JyJ\\ARpaJg:MniIYiZryL:qDr]fJV]eR,iuDxSPv[oY:qLjrYY]bPA,rejiDo^By[,`mxXgnjkPa,_grwZxX`kA,\\OLO_zFEeT:xJR]YarNNn 0/1:380883717:-62474379,-1769613882:-829056637:57433667,1227514553:68322866,1729576571,953880816,-1186526990,-1862887320:-1537612724:-98.82541:24.12381,-75.1378:-59.668877:57.233078,80.90968:96.60881,-88.86273,-30.641846,21.3125,-24.467636:-39.953922,-20.188927,99.38843,-5.183769:u:l,b:v:H,D:v,V,q,f,y:L,[:I]cUryB`WS:hRJHmG[JFU,[kaZP^eEbq:X]cDdXDjYF:tDkOUi`GjU,dwC`DsgfEO:lQmOhgcIB],ZNRGbnuLAI,j]CjtQNOW\\,cyS`hBmOSp,[E[YiTVZbZ:aGTIEgKEx] 0/1:-279336022:-1084797147,1134420570:-567169692:-1283635727,-362348689:1152882561,1764606448,29946627,-1092833737,-1928697170:-1487395248,-1139333107,431473979,158203585:92.27652:-53.75874,35.170288:-12.784073:-46.073364,63.364838:-34.01854,-96.420715,-46.4582,-62.54904,-3.9217987:75.17345,-85.619095,-76.64981,66.046524:Q:Q,j:w:h,s:q,R,x,w,e:B,G,V,b:swXpP]Tmxr:S^AnbSXQXf,LQJBTUgAex:LciDfNSEuH:Ls]QTpWGBO,X[Jns_eDFr:QKqHwAUcRa,bzPOGoHhNR,Ea^NYFQRqd,ClLx\\^fCXL,\\ROGUiBQUj:styjseYDT_,aEHA[zxJlW,eLIiDr_aGn,RTsLkpJjgO +ENA|LT795502|LT795502.1 566884162 . t c 22 . 
info_Integer_1=-63306296;info_Integer_2=1391506844,-1503768112;info_Integer_A=340548256;info_Integer_R=-1286314818,288781403;info_Integer_G=-800469678,-1311787939,-793948174,1533475939,755254594;info_Integer_.=-1341990003;info_Float_1=-76.227356;info_Float_2=-54.977512,-39.39898;info_Float_A=-35.61332;info_Float_R=-70.32056,-42.79394;info_Float_G=67.78093,43.006317,92.26671,-48.16651,4.3726654;info_Float_.=-60.336803,-45.87288,92.96947,-43.244385;info_Character_1=[;info_Character_2=c,J;info_Character_A=R;info_Character_R=o,d;info_Character_G=h,`,F,\\,q;info_Character_.=T;info_String_1=q^HZe_mW_C;info_String_2=FPSDvSVXAd,YbrjDSdRXm;info_String_A=IxDTHZYoq[;info_String_R=OsOWlbXzO\\,hAhG_b\\Ifw;info_String_G=jb^GYiHZRT,_[`_aqmUIf,PtWWNPUINQ,WkqQaaxSee,jRMUC_IYwu;info_String_.=ZVqn\\yRJEI,`vlpPiWkLZ,aVHocDfVJv GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 
0/1:389250658:-1173892904,-995837010:380428736:-350796083,-1946061625:-1985077526,-956832721,-2103216081,1213731248,-1361646347:212134446:-93.3871:73.895645,-82.49681:-59.703255:-53.21877,-11.0794525:98.62854,-40.406464,36.850067,-61.214233,28.269058:-80.0885,25.734207,92.746826,24.650955:Q:l,w:j:t,Z:K,X,a,E,S:i:VAXYF^LWPG:SudBRfeYRI,axYzALsh[m:gWvHMgghOt:cIIIEUOOnN,Q`yNRLvwIx:HeiQgtTGFY,A[RlKUJYGM,EDyo[bNg]Z,[DQbbRhs\\H,DNoj_HFJZ]:u[WuJ]OfAC,ToajkjZMqO 0/1:1247618239:1495558316,1270330192:-1812953658:2099386438,-1719636933:-1719318579,-2036965806,-1361738579,-438246128,154780382:-2087289599:99.94664:-24.0057,11.140228:-54.74951:41.22667,-17.469597:62.76808,-47.069477,-82.23286,97.09668,34.973145:-90.37955,-0.9262085,87.107376,69.280334:y:M,T:Z:Z,`:K,y,f,N,c:s:DmKvSoUTTo:zjlBpBcCYU,wEhOIx\\sXm:YYmcVdAtGt:UlifazYxMd,snMxUXwcD^:AHixNQliMD,JdpvgRsGQe,fPSRoIIRVL,AHM[ETIVla,GXPIPLRtqa:pvvKB`NrwP,lTl[KlJinZ 0/1:-1942376961:1024730712,39811746:1702586628:1394978346,832590142:1113681009,1611955235,-169392027,706295232,-1382855589:1043164434,456932470,-1198813064:74.17302:79.9146,-59.669567:-14.853073:-39.896774,-26.528046:71.47615,-73.95854,14.603233,-59.66177,-35.773087:8.042595,-81.39966:i:\\,`:s:Y,S:f,y,L,s,e:W,a,s:vLy\\C]]Bkb:cU[]e_icry,peVDqFyCOL:pmMspaUUXk:uIhBoRPWTP,^jGL[\\`Ei]:CFKAibZSAV,]jyZA_dhSu,UVV]AtIjJu,Than`WdhfE,GzqGzeGtmq:vj\\WHa^Crz,lzLRWOruNj,[V\\LWB]XnM "; #[test] @@ -1090,53 +851,98 @@ chr2 300 . 
G A 70 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_R assert_eq!( col_names, vec![ - "alternate".to_string(), - "chromosome".to_string(), - "filter".to_string(), - "format_first_Format_1".to_string(), - "format_first_Format_A".to_string(), - "format_first_Format_G".to_string(), - "format_first_Format_R".to_string(), - "format_first_Format_fixed".to_string(), - "format_first_Format_u".to_string(), - "format_first_GT".to_string(), - "format_second_Format_1".to_string(), - "format_second_Format_A".to_string(), - "format_second_Format_G".to_string(), - "format_second_Format_R".to_string(), - "format_second_Format_fixed".to_string(), - "format_second_Format_u".to_string(), - "format_second_GT".to_string(), - "identifier".to_string(), - "info_Flag".to_string(), - "info_Info1".to_string(), - "info_Info_A".to_string(), - "info_Info_G".to_string(), - "info_Info_RChar".to_string(), - "info_Info_RString".to_string(), - "info_Info_fixed".to_string(), - "info_Info_u".to_string(), - "position".to_string(), - "quality".to_string(), - "reference".to_string(), + "alternate", + "chromosome", + "filter", + "format_sample_0_GT", + "format_sample_0_format_Character_.", + "format_sample_0_format_Character_1", + "format_sample_0_format_Character_2", + "format_sample_0_format_Character_A", + "format_sample_0_format_Character_G", + "format_sample_0_format_Character_R", + "format_sample_0_format_Float_.", + "format_sample_0_format_Float_1", + "format_sample_0_format_Float_2", + "format_sample_0_format_Float_A", + "format_sample_0_format_Float_G", + "format_sample_0_format_Float_R", + "format_sample_0_format_Integer_.", + "format_sample_0_format_Integer_1", + "format_sample_0_format_Integer_2", + "format_sample_0_format_Integer_A", + "format_sample_0_format_Integer_G", + "format_sample_0_format_Integer_R", + "format_sample_0_format_String_.", + "format_sample_0_format_String_1", + "format_sample_0_format_String_2", + "format_sample_0_format_String_A", + 
"format_sample_0_format_String_G", + "format_sample_0_format_String_R", + "format_sample_1_GT", + "format_sample_1_format_Character_.", + "format_sample_1_format_Character_1", + "format_sample_1_format_Character_2", + "format_sample_1_format_Character_A", + "format_sample_1_format_Character_G", + "format_sample_1_format_Character_R", + "format_sample_1_format_Float_.", + "format_sample_1_format_Float_1", + "format_sample_1_format_Float_2", + "format_sample_1_format_Float_A", + "format_sample_1_format_Float_G", + "format_sample_1_format_Float_R", + "format_sample_1_format_Integer_.", + "format_sample_1_format_Integer_1", + "format_sample_1_format_Integer_2", + "format_sample_1_format_Integer_A", + "format_sample_1_format_Integer_G", + "format_sample_1_format_Integer_R", + "format_sample_1_format_String_.", + "format_sample_1_format_String_1", + "format_sample_1_format_String_2", + "format_sample_1_format_String_A", + "format_sample_1_format_String_G", + "format_sample_1_format_String_R", + "identifier", + "info_info_Character_.", + "info_info_Character_1", + "info_info_Character_2", + "info_info_Character_A", + "info_info_Character_G", + "info_info_Character_R", + "info_info_Flag_0", + "info_info_Float_.", + "info_info_Float_1", + "info_info_Float_2", + "info_info_Float_A", + "info_info_Float_G", + "info_info_Float_R", + "info_info_Integer_.", + "info_info_Integer_1", + "info_info_Integer_2", + "info_info_Integer_A", + "info_info_Integer_G", + "info_info_Integer_R", + "info_info_String_.", + "info_info_String_1", + "info_info_String_2", + "info_info_String_A", + "info_info_String_G", + "info_info_String_R", + "position", + "quality", + "reference" ] ); - assert_eq!( - format!("{:?}", data.get("chromosome")), - format!( - "{:?}", - Some(&ColumnData::String(arrow2::array::MutableUtf8Array::new())) - ) - ); - - assert_eq!( - format!("{:?}", data.get_mut("chromosome")), - format!( - "{:?}", - Some(&ColumnData::String(arrow2::array::MutableUtf8Array::new())) - ) - ); + 
match data.get_mut("chromosome") { + Some(ColumnData::String(a)) => assert_eq!( + a.finish(), + arrow::array::StringBuilder::with_capacity(10, 10 * 10).finish() + ), + _ => panic!("Column chromosome not match type"), + } } #[test] @@ -1146,12 +952,11 @@ chr2 300 . G A 70 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_R let header: noodles::vcf::Header = reader.read_header().unwrap(); let schema = schema::from_header(&header, false).unwrap(); - let schema_map: HashMap = schema - .fields - .iter() - .cloned() - .map(|f| (f.name.clone(), f)) - .collect(); + let schema_map: rustc_hash::FxHashMap = schema + .all_fields() + .into_iter() + .map(|f| (f.name().to_string(), f.clone())) + .collect::>(); let mut data = Name2Data::new(10, &schema); @@ -1159,100 +964,95 @@ chr2 300 . G A 70 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_R let record = iterator.next().unwrap().unwrap(); data.add_record(record, &header, &schema_map).unwrap(); - assert_eq!(format!("{:?}", data.get("alternate")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1]), values: [84] }, validity: None }))".to_string()); - - assert_eq!(format!("{:?}", data.get("chromosome")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 4]), values: [99, 104, 114, 49] }, validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("filter")), "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 1]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 4]), values: [80, 65, 83, 83] }, validity: None }, validity: None }))".to_string()); - - assert_eq!( - format!("{:?}", data.get("format_first_Format_1")), - "Some(Int(MutablePrimitiveArray { data_type: Int32, values: [44], validity: None }))" - .to_string() - ); - assert_eq!( - 
format!("{:?}", data.get("format_first_Format_A")), - "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 5]), values: [116, 101, 115, 116, 65] }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_first_Format_G")), - "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 3]), values: MutablePrimitiveArray { data_type: Int32, values: [1, 2, 3], validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_first_Format_R")), - "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 2]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1, 2]), values: [82, 65] }, validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_first_Format_fixed")), - "Some(ListFloat(MutableListArray { data_type: List(Field { name: \"item\", data_type: Float32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 4]), values: MutablePrimitiveArray { data_type: Float32, values: [1.0, 2.0, 3.0, 4.0], validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_first_Format_u")), - "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 4]), values: MutablePrimitiveArray { data_type: Int32, values: [0, 2, 4, 6], validity: None }, validity: None }))".to_string() - ); - assert_eq!(format!("{:?}", data.get("format_first_GT")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 3]), values: [48, 47, 49] }, validity: None }))".to_string()); - - assert_eq!( - format!("{:?}", 
data.get("format_second_Format_1")), - "Some(Int(MutablePrimitiveArray { data_type: Int32, values: [44], validity: None }))" - .to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_second_Format_A")), - "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 5]), values: [116, 101, 115, 116, 65] }, validity: None }))".to_string() - ); - - assert_eq!( - format!("{:?}", data.get("format_second_Format_G")), - "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 3]), values: MutablePrimitiveArray { data_type: Int32, values: [1, 2, 3], validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_second_Format_R")), - "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 2]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1, 2]), values: [114, 97] }, validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_second_Format_fixed")), - "Some(ListFloat(MutableListArray { data_type: List(Field { name: \"item\", data_type: Float32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 4]), values: MutablePrimitiveArray { data_type: Float32, values: [1.0, 2.0, 3.0, 5.0], validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_second_Format_u")), - "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 5]), values: MutablePrimitiveArray { data_type: Int32, values: [0, 2, 5, 6, 1], validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_second_GT")), - "Some(String(MutableUtf8Array { values: 
MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 3]), values: [49, 47, 49] }, validity: None }))".to_string() - ); - - assert_eq!(format!("{:?}", data.get("identifier")), "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 0]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0]), values: [] }, validity: None }, validity: None }))".to_string()); - - assert_eq!(format!("{:?}", data.get("info_Flag")), "Some(Bool(MutableBooleanArray { data_type: Boolean, values: [0b_______0], validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("info_Info1")), "Some(Float(MutablePrimitiveArray { data_type: Float32, values: [0.0], validity: Some([0b_______0]) }))".to_string()); - assert_eq!( - format!("{:?}", data.get("info_Info_A")), - "Some(Int(MutablePrimitiveArray { data_type: Int32, values: [42], validity: None }))" - .to_string() - ); - assert_eq!(format!("{:?}", data.get("info_Info_G")), "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 3]), values: MutablePrimitiveArray { data_type: Int32, values: [1, 2, 3], validity: None }, validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("info_Info_RChar")), "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 2]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1, 2]), values: [114, 97] }, validity: None }, validity: None }))".to_string()); - assert_eq!( - format!("{:?}", data.get("info_Info_RString")), - "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 2]), values: MutableUtf8Array { values: 
MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 3, 6]), values: [114, 101, 102, 97, 108, 116] }, validity: None }, validity: None }))".to_string() - ); - assert_eq!(format!("{:?}", data.get("info_Info_fixed")), "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 3]), values: MutablePrimitiveArray { data_type: Int32, values: [1, 2, 3], validity: None }, validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("info_Info_u")), "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 5]), values: MutablePrimitiveArray { data_type: Int32, values: [0, 1, 2, 3, 4], validity: None }, validity: None }))".to_string()); - - assert_eq!( - format!("{:?}", data.get("position")), - "Some(Int(MutablePrimitiveArray { data_type: Int32, values: [100], validity: None }))" - .to_string() - ); - assert_eq!(format!("{:?}", data.get("quality")), "Some(Float(MutablePrimitiveArray { data_type: Float32, values: [50.0], validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("reference")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1]), values: [65] }, validity: None }))".to_string()); - - let record = iterator.next().unwrap().unwrap(); - let mut data = Name2Data::new(10, &schema); - data.add_record(record, &header, &schema_map).unwrap(); - - assert_eq!(format!("{:?}", data.get("alternate")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1, 3]), values: [71, 67, 71] }, validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("filter")), "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 1, 2]), values: MutableUtf8Array { values: 
MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 4, 8]), values: [80, 65, 83, 83, 80, 65, 83, 83] }, validity: None }, validity: None }))".to_string()); + match data.get("chromosome") { + Some(ColumnData::String(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), b"YAR028W"); + assert_eq!(a.offsets_slice(), &[0, 7]); + } + _ => panic!("Column chromosome does not match type"), + } + match data.get("position") { + Some(ColumnData::Int(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), &[509242864]); + } + _ => panic!("Column position does not match type"), + } + match data.get("identifier") { + Some(ColumnData::ListString(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_ref().values_slice(), b""); + assert_eq!(a.offsets_slice(), &[0, 0]); + } + _ => panic!("Column identifier does not match type"), + } + match data.get("reference") { + Some(ColumnData::String(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), b"A"); + assert_eq!(a.offsets_slice(), &[0, 1]); + } + _ => panic!("Column reference does not match type"), + } + match data.get("alternate") { + Some(ColumnData::String(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), b"ATG"); + assert_eq!(a.offsets_slice(), &[0, 3]); + } + _ => panic!("Column alternate does not match type"), + } + match data.get("quality") { + Some(ColumnData::Float(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), &[6.]); + } + _ => panic!("Column quality does not match type"), + } + match data.get("filter") { + Some(ColumnData::ListString(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_ref().len(), 1); + assert_eq!(a.values_ref().values_slice(), b"Filter_0"); + assert_eq!(a.values_ref().offsets_slice(), &[0, 8]); + assert_eq!(a.offsets_slice(), &[0, 1]); + } + _ => panic!("Column filter does not match type"), + } + match data.get("info_info_Integer_1") { + Some(ColumnData::Int(a)) => { + assert_eq!(a.len(), 1); + 
assert_eq!(a.values_slice(), &[-1867486102]); + } + _ => panic!("Column info_info_Integer_1 does not match type"), + } + match data.get("info_info_Integer_2") { + Some(ColumnData::ListInt(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_ref().len(), 2); + assert_eq!(a.values_ref().values_slice(), &[1180908493, 1041698941]); + assert_eq!(a.offsets_slice(), &[0, 2]); + assert_eq!(a.offsets_slice(), &[0, 2]); + } + _ => panic!("Column info_info_Integer_2 does not match type"), + } + match data.get("info_info_Integer_A") { + Some(ColumnData::Int(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), &[-207506013]); + } + _ => panic!("Column info_info_Integer_A does not match type"), + } + match data.get("info_info_Integer_R") { + Some(ColumnData::ListInt(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_ref().len(), 2); + assert_eq!(a.values_ref().values_slice(), &[-1221871784, -1356802777]); + assert_eq!(a.offsets_slice(), &[0, 2]); + assert_eq!(a.offsets_slice(), &[0, 2]); + } + _ => panic!("Column info_info_Integer_R does not match type"), + } } } diff --git a/src/record2chunk.rs b/src/record2chunk.rs index 85dcc39..abaac95 100644 --- a/src/record2chunk.rs +++ b/src/record2chunk.rs @@ -4,17 +4,17 @@ /* crate use */ -use arrow2::datatypes::Field; +use arrow::datatypes::Field; /* project use */ use crate::name2data::*; +/// Convert vcf record iterator into Parquet chunk pub struct Record2Chunk { inner: T, length: usize, header: noodles::vcf::Header, - schema: arrow2::datatypes::Schema, - schema_map: std::collections::HashMap, + schema: std::sync::Arc, end: bool, } @@ -22,40 +22,20 @@ impl Record2Chunk where T: Iterator>, { + /// Create a new Record2Chunk pub fn new( inner: T, length: usize, header: noodles::vcf::Header, - schema: arrow2::datatypes::Schema, + schema: std::sync::Arc, ) -> Self { - let mut res = Self { + Self { inner, length, header, schema, - schema_map: Default::default(), end: false, - }; - res.schema_map = res - .schema - 
.fields - .iter() - .cloned() - .map(|f| (f.name.clone(), f)) - .collect(); - res - } - - pub fn encodings(&self) -> Vec> { - self.schema - .fields - .iter() - .map(|f| { - arrow2::io::parquet::write::transverse(&f.data_type, |_| { - arrow2::io::parquet::write::Encoding::Plain - }) - }) - .collect() + } } } @@ -63,10 +43,7 @@ impl Iterator for Record2Chunk where T: Iterator>, { - type Item = Result< - arrow2::chunk::Chunk>, - arrow2::error::Error, - >; + type Item = Result; fn next(&mut self) -> Option { if self.end { @@ -78,23 +55,36 @@ where for _ in 0..self.length { match self.inner.next() { Some(Ok(record)) => { - if let Err(e) = name2data.add_record(record, &self.header, &self.schema_map) { + if let Err(e) = name2data.add_record( + record, + &self.header, + &self + .schema + .all_fields() + .into_iter() + .map(|f| (f.name().to_string(), f.clone())) + .collect::>(), + ) { return Some(Err(e)); } } - Some(Err(e)) => return Some(Err(arrow2::error::Error::Io(e))), + Some(Err(e)) => { + return Some(Err(arrow::error::ArrowError::IoError("".to_string(), e))) + } None => { self.end = true; - return Some(Ok(arrow2::chunk::Chunk::new( + return Some(arrow::record_batch::RecordBatch::try_new( + self.schema.clone(), name2data.into_arc(&self.schema), - ))); + )); } } } - Some(Ok(arrow2::chunk::Chunk::new( + Some(arrow::record_batch::RecordBatch::try_new( + self.schema.clone(), name2data.into_arc(&self.schema), - ))) + )) } } diff --git a/src/schema.rs b/src/schema.rs index 98b2227..63bb31a 100644 --- a/src/schema.rs +++ b/src/schema.rs @@ -1,6 +1,7 @@ //! 
Construct parquet schema corresponding to vcf /* std use */ +use std::sync::Arc; /* crate use */ @@ -11,7 +12,7 @@ use crate::*; pub fn from_header( header: &noodles::vcf::Header, info_optional: bool, -) -> error::Result { +) -> error::Result { let mut columns = Vec::new(); // required column @@ -23,30 +24,30 @@ pub fn from_header( // genotype field columns.extend(genotype(header)); - Ok(arrow2::datatypes::Schema::from(columns)) + Ok(arrow::datatypes::Schema::new(columns)) } -fn required_column() -> Vec { +fn required_column() -> Vec { vec![ - arrow2::datatypes::Field::new("chromosome", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("position", arrow2::datatypes::DataType::Int32, false), - arrow2::datatypes::Field::new( + arrow::datatypes::Field::new("chromosome", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("position", arrow::datatypes::DataType::Int32, false), + arrow::datatypes::Field::new( "identifier", - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( - "id", - arrow2::datatypes::DataType::Utf8, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + "identifier", + arrow::datatypes::DataType::Utf8, false, ))), false, ), - arrow2::datatypes::Field::new("reference", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("alternate", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("quality", arrow2::datatypes::DataType::Float32, true), - arrow2::datatypes::Field::new( + arrow::datatypes::Field::new("reference", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("alternate", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("quality", arrow::datatypes::DataType::Float32, true), + arrow::datatypes::Field::new( "filter", - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( 
"filter", - arrow2::datatypes::DataType::Utf8, + arrow::datatypes::DataType::Utf8, false, ))), false, @@ -54,7 +55,7 @@ fn required_column() -> Vec { ] } -fn info(header: &noodles::vcf::Header, info_optional: bool) -> Vec { +fn info(header: &noodles::vcf::Header, info_optional: bool) -> Vec { let mut fields = Vec::new(); for (name, value) in header.infos() { @@ -62,74 +63,65 @@ fn info(header: &noodles::vcf::Header, info_optional: bool) -> Vec { - arrow2::datatypes::DataType::Int32 + arrow::datatypes::DataType::Int32 } noodles::vcf::header::record::value::map::info::Type::Float => { - arrow2::datatypes::DataType::Float32 + arrow::datatypes::DataType::Float32 } noodles::vcf::header::record::value::map::info::Type::Flag => { - arrow2::datatypes::DataType::Boolean + arrow::datatypes::DataType::Boolean } noodles::vcf::header::record::value::map::info::Type::Character => { - arrow2::datatypes::DataType::Utf8 + arrow::datatypes::DataType::Utf8 } noodles::vcf::header::record::value::map::info::Type::String => { - arrow2::datatypes::DataType::Utf8 + arrow::datatypes::DataType::Utf8 } }; match value.number() { noodles::vcf::header::Number::Count(0 | 1) | noodles::vcf::header::Number::A => fields - .push(arrow2::datatypes::Field::new( + .push(arrow::datatypes::Field::new( &key, arrow_type, info_optional, )), - noodles::vcf::header::Number::R => fields.push(arrow2::datatypes::Field::new( + noodles::vcf::header::Number::R => fields.push(arrow::datatypes::Field::new( &key, - arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new( - &key, - arrow_type, - info_optional, - )), - 2, - ), + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, + arrow_type, + info_optional, + ))), info_optional, )), - noodles::vcf::header::Number::Count(n) => fields.push(arrow2::datatypes::Field::new( + noodles::vcf::header::Number::Count(_n) => fields.push(arrow::datatypes::Field::new( &key, - arrow2::datatypes::DataType::FixedSizeList( - 
Box::new(arrow2::datatypes::Field::new( - &key, - arrow_type, - info_optional, - )), - n, - ), - false, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, + arrow_type, + info_optional, + ))), + info_optional, )), - noodles::vcf::header::Number::G => fields.push(arrow2::datatypes::Field::new( + noodles::vcf::header::Number::G => fields.push(arrow::datatypes::Field::new( &key, - arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new( - &key, - arrow_type, - info_optional, - )), - 3, - ), - false, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, + arrow_type, + info_optional, + ))), + info_optional, )), - noodles::vcf::header::Number::Unknown => fields.push(arrow2::datatypes::Field::new( + noodles::vcf::header::Number::Unknown => fields.push(arrow::datatypes::Field::new( &key, - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( &key, arrow_type, info_optional, ))), - false, + info_optional, )), } } @@ -137,7 +129,7 @@ fn info(header: &noodles::vcf::Header, info_optional: bool) -> Vec Vec { +fn genotype(header: &noodles::vcf::Header) -> Vec { let mut fields = Vec::new(); for sample in header.sample_names() { @@ -146,59 +138,54 @@ fn genotype(header: &noodles::vcf::Header) -> Vec { let arrow_type = match value.ty() { noodles::vcf::header::record::value::map::format::Type::Integer => { - arrow2::datatypes::DataType::Int32 + arrow::datatypes::DataType::Int32 } noodles::vcf::header::record::value::map::format::Type::Float => { - arrow2::datatypes::DataType::Float32 + arrow::datatypes::DataType::Float32 } noodles::vcf::header::record::value::map::format::Type::Character => { - arrow2::datatypes::DataType::Utf8 + arrow::datatypes::DataType::Utf8 } noodles::vcf::header::record::value::map::format::Type::String => { - arrow2::datatypes::DataType::Utf8 + arrow::datatypes::DataType::Utf8 } 
}; match value.number() { noodles::vcf::header::Number::Count(0 | 1) | noodles::vcf::header::Number::A => { - fields.push(arrow2::datatypes::Field::new(key, arrow_type, false)) + fields.push(arrow::datatypes::Field::new(key, arrow_type, true)) } - noodles::vcf::header::Number::R => fields.push(arrow2::datatypes::Field::new( + noodles::vcf::header::Number::R => fields.push(arrow::datatypes::Field::new( &key, - arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new(&key, arrow_type, false)), - 2, - ), - false, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, arrow_type, true, + ))), + true, )), - noodles::vcf::header::Number::Count(n) => { - fields.push(arrow2::datatypes::Field::new( + noodles::vcf::header::Number::Count(_n) => { + fields.push(arrow::datatypes::Field::new( &key, - arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new(&key, arrow_type, false)), - n, - ), - false, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, arrow_type, true, + ))), + true, )) } - noodles::vcf::header::Number::G => fields.push(arrow2::datatypes::Field::new( + noodles::vcf::header::Number::G => fields.push(arrow::datatypes::Field::new( &key, - arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new(&key, arrow_type, false)), - 3, - ), - false, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, arrow_type, true, + ))), + true, )), - noodles::vcf::header::Number::Unknown => { - fields.push(arrow2::datatypes::Field::new( - &key, - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( - &key, arrow_type, false, - ))), - false, - )) - } + noodles::vcf::header::Number::Unknown => fields.push(arrow::datatypes::Field::new( + &key, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, arrow_type, true, + ))), + true, + )), } } } @@ -208,6 +195,8 @@ fn 
genotype(header: &noodles::vcf::Header) -> Vec { #[cfg(test)] mod tests { + use std::sync::Arc; + use super::*; // @@ -237,150 +226,158 @@ mod tests { "; lazy_static::lazy_static! { - static ref MINI_COLS: Vec = vec![ - arrow2::datatypes::Field::new("chromosome", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("position", arrow2::datatypes::DataType::Int32, false), - arrow2::datatypes::Field::new( + static ref MINI_COLS: Vec = vec![ + arrow::datatypes::Field::new("chromosome", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("position", arrow::datatypes::DataType::Int32, false), + arrow::datatypes::Field::new( "identifier", - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( - "id", - arrow2::datatypes::DataType::Utf8, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + "identifier", + arrow::datatypes::DataType::Utf8, false, ))), false, ), - arrow2::datatypes::Field::new("reference", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("alternate", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("quality", arrow2::datatypes::DataType::Float32, true), - arrow2::datatypes::Field::new( + arrow::datatypes::Field::new("reference", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("alternate", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("quality", arrow::datatypes::DataType::Float32, true), + arrow::datatypes::Field::new( "filter", - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( "filter", - arrow2::datatypes::DataType::Utf8, + arrow::datatypes::DataType::Utf8, false, ))), false, ), ]; - static ref INFO_COLS: Vec = vec![ - arrow2::datatypes::Field { name: "info_Flag".to_string(), data_type: arrow2::datatypes::DataType::Boolean, is_nullable: false, metadata: 
std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info1".to_string(), data_type: arrow2::datatypes::DataType::Float32, is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_fixed".to_string(), data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new(arrow2::datatypes::Field { name: "info_Info_fixed".to_string(), data_type: arrow2::datatypes::DataType::Int32, is_nullable: false, metadata: std::collections::BTreeMap::new() }), 3), is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_A".to_string(), data_type: arrow2::datatypes::DataType::Int32, is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_RString".to_string(), data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new(arrow2::datatypes::Field { name: "info_Info_RString".to_string(), data_type: arrow2::datatypes::DataType::Utf8, is_nullable: false, metadata: std::collections::BTreeMap::new() }), 2), is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_RChar".to_string(), data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new(arrow2::datatypes::Field { name: "info_Info_RChar".to_string(), data_type: arrow2::datatypes::DataType::Utf8, is_nullable: false, metadata: std::collections::BTreeMap::new() }), 2), is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_G".to_string(), data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new(arrow2::datatypes::Field { name: "info_Info_G".to_string(), data_type: arrow2::datatypes::DataType::Int32, is_nullable: false, metadata: std::collections::BTreeMap::new() }), 3), is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_.".to_string(), data_type: 
arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field { name: "info_Info_.".to_string(), data_type: arrow2::datatypes::DataType::Int32, is_nullable: false, metadata: std::collections::BTreeMap::new() })), is_nullable: false, metadata: std::collections::BTreeMap::new() }]; - - static ref FORMAT_COLS: Vec = vec![ - arrow2::datatypes::Field { - name: "format_first_Format_1".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_first_Format_fixed".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_first_Format_fixed".to_string(), - data_type: arrow2::datatypes::DataType::Float32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),4), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_first_Format_A".to_string(), - data_type: arrow2::datatypes::DataType::Utf8, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_first_Format_R".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_first_Format_R".to_string(), - data_type: arrow2::datatypes::DataType::Utf8, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),2), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_first_Format_G".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_first_Format_G".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),3), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - 
arrow2::datatypes::Field { - name: "format_first_Format_.".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_1".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_fixed".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_second_Format_fixed".to_string(), - data_type: arrow2::datatypes::DataType::Float32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),4), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_A".to_string(), - data_type: arrow2::datatypes::DataType::Utf8, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_R".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_second_Format_R".to_string(), - data_type: arrow2::datatypes::DataType::Utf8, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),2), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_G".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_second_Format_G".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),3), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_.".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - 
metadata: std::collections::BTreeMap::new() - } + static ref INFO_COLS: Vec = vec![ + arrow::datatypes::Field::new("info_Flag".to_string(), arrow::datatypes::DataType::Boolean, false), + arrow::datatypes::Field::new("info_Info1".to_string(),arrow::datatypes::DataType::Float32, false), + arrow::datatypes::Field::new( "info_Info_fixed".to_string(), arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new("info_Info_fixed".to_string(),arrow::datatypes::DataType::Int32, false)), ),false), + arrow::datatypes::Field::new("info_Info_A".to_string(),arrow::datatypes::DataType::Int32, false), + arrow::datatypes::Field::new("info_Info_RString".to_string(),arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new("info_Info_RString".to_string(),arrow::datatypes::DataType::Utf8, false)), ), false), + arrow::datatypes::Field::new("info_Info_RChar".to_string(),arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new("info_Info_RChar".to_string(),arrow::datatypes::DataType::Utf8, false)), ), false), + arrow::datatypes::Field::new("info_Info_G".to_string(), arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new("info_Info_G".to_string(),arrow::datatypes::DataType::Int32, false)), ), false), + arrow::datatypes::Field::new("info_Info_.".to_string(), arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new("info_Info_.".to_string(),arrow::datatypes::DataType::Int32, false))), false) + ]; + + static ref FORMAT_COLS: Vec = vec![ + arrow::datatypes::Field::new( + "format_first_Format_1".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ), + arrow::datatypes::Field::new( + "format_first_Format_fixed".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_first_Format_fixed".to_string(), + arrow::datatypes::DataType::Float32, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_first_Format_A".to_string(), + 
arrow::datatypes::DataType::Utf8, + true, + + ), + arrow::datatypes::Field::new( + "format_first_Format_R".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_first_Format_R".to_string(), + arrow::datatypes::DataType::Utf8, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_first_Format_G".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_first_Format_G".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_first_Format_.".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_1".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_fixed".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_second_Format_fixed".to_string(), + arrow::datatypes::DataType::Float32, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_A".to_string(), + arrow::datatypes::DataType::Utf8, + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_R".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_second_Format_R".to_string(), + arrow::datatypes::DataType::Utf8, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_G".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_second_Format_G".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_.".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ) ]; } @@ -414,14 +411,14 @@ mod tests { let header: noodles::vcf::Header = reader.read_header().unwrap(); - let mut data: Vec = Vec::new(); - 
data.extend_from_slice(&*MINI_COLS); - data.extend_from_slice(&*INFO_COLS); - data.extend_from_slice(&*FORMAT_COLS); + let mut data: Vec = Vec::new(); + data.extend_from_slice(&MINI_COLS); + data.extend_from_slice(&INFO_COLS); + data.extend_from_slice(&FORMAT_COLS); assert_eq!( from_header(&header, false).unwrap(), - arrow2::datatypes::Schema::from(data) + arrow::datatypes::Schema::new(data.iter().map(|x| x.clone()).collect::>()), ); } } diff --git a/tests/data/test.parquet b/tests/data/test.parquet index 40e9fbec7ea460128fca7ef0003fe92d8e880c86..6dee4aeb2f03a5e7c69fa8929cea21fcde2dfe09 100644 GIT binary patch literal 47127 zcmd^odtg)by?BzQJUWJm<`^6oJBLRm4BA?-Rj+bR(xhpVrcKi}P1E-yP2Ww@=iDy7 zJ7mbvF{X^m7(<50kf9A#E@$$=|{It1g>0?{6pVDXyGv+MSOqn&um;ohvyL&^u z-K_>4!!SE^%r+f+M9XUAZvz{p2DjaCC1GO0h6~-^d3QWISn%A5%4-7EzpQTD@x;Uk1vqG#KTS>9Lqmj|B;78T0k&v#vF@-aj3P(?yA! z*4G;fgaf@jgtV8n%xhY<4=6KA-p-TUBP1Xb0N7CgAZra^&oInH?D%gMJk>)X2L6Fm z1u5iZ1X@~q!hyCxYf#@62!$nSg4)G+*m;0>8AA{(Ff6hgrV#{J+EF_kei1#H@XiWY zXBcM5dDgm(T8?l#310V1o_{V zm4oBj;l)~TF;qNiH#^B0_ z^ezs_G8*QJ0s|$2`RegCBYYoRPQ=D&Cz#2!sF_UdA870fghz;~j%u0Xf|ZD`=gB|; zomqxlJN!W_G5V%~>?{Ffu|7+3FLA33Vm?#AeC968STHSGINK=PWl;(>>HTSG)3j*| zVzxDy6U!nw{O1snr?r^rte(;Umq@i?&tj@>o1*LQ627g__ObwJufr=`&Sfo048t_a8v-%vu_)nf>h>E9 zbJNm3x&89x=~FYuxQwr!-n513JUP3_& z0x^XIfCwhU${#)T!@thyaDMODmTS!4dr6~toIkW~+7r14fA=3<re9@@K*CyyL$33`SMRy$0-RoNBgy0n-EF@e^MG0T8HEaaku|#+F3&oyi7Cv+R zZ!>>-;N9Onv-L-h-o7RLv#WNjvR(U2txhw2+dxU)Gw+=+YQJhZIJL9rw|l-m^DB#M z-@X3+jbFVX@X^v0Lk~W7IPKC?2U^@es|hc=JfZ#&kWr7tmGzfB^}44#q020i+CfQw zu0;BC5&iAgF^6^R0jfU{C5k~zMa6JXYd8#I$Rc8BxKRK6p3b*H)SAnNU;E~Fmgd}l z@b2GxZ_$4;n)P4%-l_fee;YpxPA`1Hd(gRNh32KH4=^LEm$SFr@!s!G{PYRm_IG-x zy#K@FF8||y{njS^Pk%GOJ@WfKzcP(|?0Us^w06(0o_x>w#8D89W~r>7QJ>OcOQmmH7*P^1>ru&pKjz{O8FG+?q&0 zRuIrK=7Ns-yN*3g1tcP7;>Z94-=;O}1OeSZ1mwO@{!RYcJAQrV9e-Z>$*kwU;P~6~ z)6G|275VVYy@sL(m$mvX{bSL^#jk(N9BaMzrsKE1n)BUr&*c}a?=c)HwZ`{4s6 zogFPRy6?GT+e?rCq3QIUwwkeTzp2|L*vn$MzauiE9>G(4S-kA#dEoDt1aS!VlK<7D 
z{=PVoH~=BDn0Z;pysl$krQ#4#qP>WzsJ%>x(#>X8s%RZjbbI?cg)_ z74m&Qo-#hxzKU-Tua5w4%5fSC8};}a;Y}W=g<9`&F-}{$dK<$EQCftAB#Wsi$?_AyoqX2qd%1&OeUSUp)%S1> zZ;l<@^WbsrlRZt5)qmU&+4Rms+=aeX2S41nD{@DHnV&IrCBI4YcqDVns>r293lnP& z0UPy*Mp<)$zhQh%s5u&)<20ubn?-ae0x=a8fjl-h)Ll4;v6=bKclj-+S8`eJXO3UA z{w&g%u`lwIyS^2<@Z!Vco1W|+Uv#r3GWOpG##i2WC~}c~AoBFer}>9Vnj;6Mt%_(8wGo9EG?n zBE`5YrlP{xt2G>exV)IeW#5IGu}5C#HXQmfufO>p6XZSzq%{(jUD?UE6(3Ge(Bnck*tSj@hcv`d0f9XJ@N-n zf8@1q+!U$1@GQUl?~Bl0%!WcAxI1lY&LouQa|~1F7-vZ%Y@@$E3lK7li>YKaw|ZhP0-zl1N@Ng$&zXfC zvSDrdtuDN&$FEBKssUL=d+0(}0^8*o3$(*Vt&32Q?n_Ufm6oB39e?i zEI@Z5QX9grA^a)=x)&RA;@jNXA)}TjbjMoTGjsJ*XXUuvgHzV;d+Sf2(rE87Pmv88 z$dw>q8a3OVt40(W7t`z~zGr z8L0*0JCWpa{OZN8mV{au6}s&*Ei5k`G(NoV#s3r8P}Tws%8FW`fmx;n0Uxk+XTtaLk|Qm$S_O+)1SRSJ7Ux>rjlQ#NxyX2at+%5 z*x{J{E0at^r6S1~F6S{%Yngpo)`NB3POdTVmg4Y zD&(dJzZ&q%Eht6aJVlEtpsum^(K**b!}PMKYvj16-2ap3-g-M`Tgn{Lh^NRQZJElH zBH#lS*Xs>+h?t3Dpjo+ackqu=kb>RZ;nLbxfe_OEKHEbwZc2@*jIU*A|>!Mjk3W>0-de|xB@4dv?{toC-d zw}lIbAQ2w0nTKops~dW>Gd-c|K>KKisoCs7`IecQN4=r`a!2~i`4x>VFm5^O9yDv` z`Yfg8=E3qx57&als+)ox=JFPoryJCu*e=fy1z^k&`KJb)tc@;&_jt9?*shRI0{fnH zW@9@0X&MdeNzjAB*6IwaV)pzks<8S}XQ(~Y-&77(J62|HY3k|gD`b3U=+Q6JETDflbu-{e`LddpZv%An+-P?~fxnf0AT1D|`9j02nry@!tASL%1O{1{U3W|R%*@V!r?IEmSL_40g+sv#zAI4G zitZ6=HVsw{`wMD<+L@VLiLIhD>H(nocg7(M6|8gUOpBaYUt#f8X` zqt)Hf+f!X>>%q!rvG$7j9d=uwp|`5n55ZynsINXSLtCHinqxG>uKB)@ zE{zzlX|&VrtLm%|S&_HlP)*2MQZg_CiRSK}4o^u%X=fuO${H=f_5yFCjfX_NyTeoD zXsI$eAkp6I8ES7E>){*qN^G60pyIMWghCwITV_EG`>#(P@a0IM&ew*D_Vl_LuI;-H7j>^8O9!Eh< zIK64UchuJ&w2TZl)WR#4(msc!)@`dX!7HK8(yF%Ma0!p~z(7G|ZL_Iy)PYq@U@$Z? 
zWa}&TVPI+*stMTwO?}N4Nc8wS%X@n&{PrS9I6VbDgT3X&rC4`33foHR9p(OlR!G#l z3aW~R8wcG`du5h)b{4m|yUq6NO6R+3dQI*sN2#wEsPR_z_xMI!MFqgKuVJvOuQX8A zY4@UCHC7h(Hk;cjiw40uEQTVVfsb|PW{~0a3{n2zDKSwJPx-=IPe1yc_6F?RU5fSX z@?V(y@M}%4{JZg;Ex)bX@VC=KOFA#!NN7_>As>u~XZ}8wd1)%UZc5Z;64L=zbc12B z9XP<|sG$^la#>5V>7T{;XYrE7@N~i?DeVHLK{;a@JSO}wTAZ9)uE~rw(Tua(dV}3C zz!qxk?(1r;&#Je&fjEZQsAaZl*&?9LGXVzBGGT1V)O_8VH@|=R%PoA~w_K*ceFL8y z`oR-BwtYAM*yh|De);@uhdy}!!KQECcomSR&`i=}5u+Yoj1m}|wvmj68#CoZM$;4L zaHO+iBHTO~4q#xD*02@g&jKRO(hJTjT$wlYZ~4cC7pJe;`sx{rIXc&;0xQA6{epp5cM&`#Ya+ zSbF^E)O|aj+j0naCB4c;(fMjU&g6LX2->JefYC6#{f0$p@=2Tp*NcO?YHZVnh-xX& z=OSjGjya@b57VIpk&-+aRUjRY!Dok_44T@a9cQli)>UPvuj|e1bFb%XO*Q8ZeT{#i zcFM2*+V?Ad^ZwAw`&PX5@@fA)+JpbQgGr!beBG!=08$m>*daKmFrKffV%l;zu4PVY z*fts=uqNelI7@;+;YAIlsg?T>)CUoeu ztw;tFM-*7ZnAWfwoc>y3*yanS#?YzDcRh6G7scMfw;x`*`ORNXzuTYj$=}!iGG@ID+BRf^;I z*r>!;H}BqgYJ9)-$?=Bbo0og)?&R}sdNI;6e29ND z`#@yvk%#!YPq!eaN>Ln(8TI%ARqpmgQCt`ePtG)r;VNa~Be+OKo?g65Na_$@-s(~I{}7Uc1J=*hby3s(PTe8q1LL>?}BgwxFV?roW0nmz9L=xYa0 zteX*8@s&E>cg0|2!Acjm;Jtm3RlA;#jHP`tKE?lCz9GB$;Jtyv5!0EwBbsCXw>J)S_p_=-`FFDNKj8oyIhFpb{H6^vb|Ma+cQEmD!E7w@Cenc%thZJS`J5v-!S z?tx|@>?$cW4ys&ITBmrZ$n2Pc%pA##2d`9Ht!sE+-yulHfrT@iv(`9S;z; z`MawkceHdzJmDuI4;y~RHN5is@eiiiBbwrm4`l8y;^+4dMM{ldjOZR&&F}id&T-Aj z`TJjUX!z2vZHjFBpI5ksZ$8T}So#94*t*$p8?ILn;ySEX@aXZeQIEihpr*gk_NA%I z(u6sm3H<_$?y1=ajqS+=hLL?4b6N*$7=#53A}x6`x}dIW!6qCXCCwp&*M%_1TLc%* zi@*px&V^_OV9>V&d)veKRg7O{z_>zm#|anqpMS1EB(YT#LIpW$dq`h9p?!Zc! 
za88H-&g15Kx`7|$8At1JO2r1e5w^&*1DSTiaSkOV!8kePEvlrP+}0{w5|+A>XpDxX za4%E8^x#;(KfplZg?hNavNR)bp`x5{p#pJks0l8Z zMxj#;qo}7aetCf*g_>3ivoU6gvV@*e_EzYBXY6=(>Q`dCrJyewhSl{I3&k+tzMtw# zphHPt$U4jlktweV_zN>Cx7Hw2WQK@1l?T{U47}QaFdQ{iA>`gEsxAfgXxLR}59g3r z=9HFsJ36s9b3u#@;O~IeumzM%*T>ZpB_oIEX~-SUC81Sf7(q)#g`%aRlF@3>JJATx z06^)t+E5a$_(g0uASfCZo=Muv60f}ZbX%6tC0+@2Z94O%$DWR*x&1~9`?e(-9QQF7%}(v`C&4rvCq}!<15|G6)tn@`i`1`*BL6e8fCxX&9`pK`oTD)C}4j zfx;pV5_P8D9&=Yqbr=TB=Q}OcqoERat+@meojrZU^*(Qx9hKGA+0@6ESB&(aemX-n zr53KfF@*2%mS8>CTIlC-gtD-+vBEjh(9w_c3XP3*1Gc)l@?M;AC>km9ajw3`B0ngr ze4vGQwY65eY-G4`xO$YchuUj!K1v~`Wr(a8GemR>QBGVzB1SnCz0oR3tq`e?-cDx@ zr?b0ov>xUZW8#(51j^m1Gi-q=DmotYJvv3z6)5kn7%~cwe&hVaE7W4c3(95TVaN(x2x72<~_rerSOJ{@AvaXJ)^FENR-&Df!fi=(ovjD zDJu5W^tF%r-Dm<9tD~%VsJGPuy7f7F0=DXaH#lNPG5f0q3mtxcyEB}?Cq#V43=y5w zCxqUMP&=xKjaH3E(wIYOER3U}*b?IavAqlf|KJmJ0qIui6CU2*KipW;$fFh3dxwWR zISW^V2JQ~`x=Q)NUKfs_hs>6`q0VrFqZkr?le3O@+G@ISda1IgKQK~LSv7*wLyhg3 zO>p(};`~Ur#ZgjG>1eRw5Wc60FKzARJFC&FdYt~2){>_FVo=Zgpu4)edZ?(m3{8Kq znd=Li!oEs0{n46A&THu?ufaKlkj*jT?{ajLY2wiqcVTz8*%QRM+`1~0y&1;(!{`*% zT%gb5aQMn`uA<)@tZgf*tZj8cA~0YcE-xLd>4oVDU$f0-E2wZfdrDAsWep=GF8kn+ zZ8(94i1>^dVqpajk#@y|St}JS(nf$tu0pY?V+`!e;A#594b&uR@Pq0Xc$b_U37 zR!>v7f2htk+D-PQ;B1yR!1G1jHI+?+AfCn&M^{(Fa8Da*tGmeCKkVvn9Kq?Io>5z% zhOe}t6f@mLp89aJ%QcWb(^uFUDD#KiR!cPsk*jfa6;~8hTOA-o#kNI;2r1P@hKg8d z3q!3CXMW9~^~SCIMLnG51c0ikV5H+%s8L06z3rT#J6V_A1i}k;V9ow5enI{Ms4g1mA2G(1p{EizKTG5 zgUcDJ=*;7zlQBqS|35mWwzNDlo*xrubIuIOggW9^f z3YMQ_jtu+ji<=y6f%F^ZJL^i?3R|iJZa1KDmD$VOL9U?&+XThkff}Ew-5kVv%30LH z4_5ZqkT$_^k*l`4F<4Rp4(=PMsTwG%b+&OPWXIjx*wNYHb@V$y|D>my7i&>rPc=gn zB`kV~kxQIt`q}g0>3L$Wb<6s@*LtBf{i6+g-hh2~v6hyKrfD-vtq_<)9&=&}vv&%+ zE}cZp#FT+mG#M&|jf6031&n4d$xffDzhkj>s($eu=`%H1)2A+n$t?Pa>=J|dEs(*4 zO?c6nuk1S(OJ}}tf8g@mUCUqvq!9CEb6?rQ+}9gq-CGdpO_M?a3SrFF4qrGqg4^$4 z10F8A6t>@K$o4yE?+PQk@@b9idc~Qm62>jvf+s_wyvI)~K_eePjzdG62C@O<_)?%Q zTmw2`KcI}5&&@HdN&n>>(@?L*G4}T2I3_zF>55mn0}{vT(d1yCon{JEJXs?nUudAm zhekd2cZjQ{AJ?S+?NWt-Ed0yT2}Yr+EyWp-IBn78fIGFYB9QE+G)mAjvlmEGNKCx# 
z{u9hfab6;hQ0!^N2{~1>N-(+{Ln56z_{>I?V5CllggTLIq)uMuU(f63bfDCDrS6lQ zS9RyPQ~%WM$1k%jhf(_Fo38%V?OXbOyBuPsvh(0WqaI&Tbe`1cRvdno=_uZDMAYaw zVrtTklNynVL%uR1KbO-86`X7oB!)shF8i4sM`(AeBcjfeLb=c5K&kWnIh`nVzNsUr z^R4JiaeFUdaP8J6tY4b(s|CN>5;>H%&8Tfubfwg(R~%MUFeB9CYp3Lz$axWFGv-<>iCjl#u^4q6J8Gv{*M96V`dj**_a0!>-i*G6*5F(-5AsUA$p8EiA;lT}< zt&3*kkqMBE#}`Pm@uUIB#=`-m*|`lv^4H) zhHTu9EX~Gk!H|vHbfww2qZYDp8>}=Nx1vHeZVi=Y<5o<_CYv@Tl;B20$i@wUlF~%} zTFi**N$Udn4`W7DYFZvLhNQ2Q$um%8qtX(KAp?#gnZwMEDz!lNie#?~oWCP(g28v9 zgh+N_V3FAV6bT;51_LM}?mUnd5f`;X5plV>q=*)%l`P4|CsDE`>wK|DYyr09Wn42m z$B0W`F;lR0T&N1N#?__LY+Ta`*|@AynvH87AsZJeO0#ikAY|i;JZUyApo45&#)jE5 zqlSjd$PhNH-jbHarB;xQtD>aYxGo8@aY>Lg8`thYHZHS~X5%6k$j0?0(rjD`0@=6> z1G5E6aNz@FlpuB`WT0IIB)H;WNM^P`W8!p_gb>Mm z3KS71HKawvxd1344&Y0Rh-1!Z8nMA|NfBaK;`k~)iHb-v)`l;~#xo@^N9_uRm4wW= zVEGv`bf6b(45BjO4}wNh{|u1SUqM{hOM^e8Kld-jrELV;ARh6rM`sw?qq!?CGlZkJr!P13L~l15 zU}?kamMcfV82bsJU)fFJR~%AV+p+v^SY3HM`d zV*aL%m`vUZ!clV3;u{7HzUb{6!!ZHwzQgd%Xzr#J+F16Ad$h6q6Zebv_pB1{&ps%Y zKk$&4zvX*k{)$J%{1Y3+`#nF>8deIFZGB7|BW>??t)VMQ(1j<&{IySN4Z&#sQCK%4 z+@FHu3xxZVKi7(q+II-gl#0GFc0}^z)1%TSujB4_q1Xwn0k+9gi(K=z*3b~WUvWxn zz_Z-YQjff&joIg^KZ^M~|0L$W_h)U)UOqZ6=C6HMEPwnxG5_rQV*aiV#Qbv~iEz&T zL(ISUiCF%~B{Bcpow^vE`|j4o=v;lDE@oNh#w7UzrZgGezhS6PON06>C@`9ywkfR&}E2!O~g4uG=c6af&q#Q{*3nj!#@nFeGQcd`gN zg--dLiR`G+f@vxq2%s$3gy3=nA4ph&Q$eUK@Py!8D&UmGpAei?1)Q=n5`tSS6-WFo zM2Ya@A10Q(LfPbaBr=bGH?hDK%8KHNf0$U*3T2Z4N-X4fKocb_Ush*z3xdITC!34R zs+Ug%W86|Jq^y=yz(+&38UhlB)CwspEfp$Alu`q)tjbitlNhE3URlAZfREM+vT3oH zCZltasK?|*l(mRKRpmWec_c4HG?=`Ivi{WZ6YV7jt*kqB&_r{|K`ZM`9W-bSL!@jD zCZ;vXWSAHh)C()?Ff~LI>M3Qh7Ak5;3>Q^b5Sodl#3KarUuToZ`b(NBdRj&48${Z(Fwh9V~rd3KNRE~ArvZhl*Bni#x94YHP zHAKlO;TTV#41;NF-i{f8dORqR<_!UDWgcNW)&2I zju~do+&LFr>@MBh*g9+F-(i8HMW@MshlP3lXK>gq^ZNI6hD93otbQ)c$I#z$i4DS% z1t0Y1Qts2U>(byP81d&;cU4Kb-|6C;ROU<^lme z_6Q+*D-#2*^lJ%pv6KQ<2ph3qyVz)8&s+gm#UHr9qm{)JuAXD=drD_8GEY;iFVBf# zRT^aH2ut(_DNY#K^H%|I@kb~SxR@fqHv({mITdB;-(tX(SfUG}6j*}L11!-WfY>;f 
zJ$W@?6@T*qv5F~zbq`@_?W07HXTF>iD_zK>fEA)q>{k_D%Vf{b1FXq@R)Kk$U_D5& z-p3}zO4lhVV1)n``|*NL=d-)M0$9c0D?nbw6yfz0!Frxz-SSn@apH!$>Cw~F9-m&o zzIr`?7JpX&L5nE@`XW)@h7ClF?=6af{!HikpIylAxdAYWNBJX0F-0)0eHv}?2*tQ2 zD~2(CqC#qko0qV=7XxVVxP1gIrU>Zm0GeSwqM%RT7y}*847I~J@ky=lL=L+t8!(Ee z+apFXMKB&D%xwJu5#Oe~Bp9g;#$l9N;HsP0txEx;cw~J10rJdog7Gzq@%3AiV5IgJ zhY_t$IF|juE$qpBz$l)?jy#Gf!s7*k@ve=8$8)#MRXu%u-EETN*O{?rNVNm6L$z~$ zc`QsT&z4XFre~+GTh1Q7eZmp!%tiw27zK9G6a%JwOfwCjiDqRW-q2&3?=!J)a}$m* zXZ8?0?^8T~x5V%$6HN`39yPw+!fq=NfRV$-nIi<)y8k2+7%PeaQzn|4rXtbQJSov@ zZR~j~!6Tgb&7357_EJ3O?J9VvK`P=wqZCf*US((Z6i+y(oB4p?d4uBFU8;hInxP^d z8VD&r`!6`0Elpn#%oFzCf zP@E5UCc{ZhEgmPeGkO5+rVjR8JKC9Kxnp$OFLT$kXb4Yl)){g&?CNkbRMgtyQK6{` z$CbVnWDj={_QYkA@^e#}^?=jJoCTa4YXP zoWi;L4))wAP?Pu=PUbB_&Da(~&DfpV7&)qJQ~Rd}Wx^^ccH4gdro?AuGVc*xJxwuv zdXI=nm2v6~YK)_A2xptYN-cKxU0NVWJdRR+G$pfg9~#doO3+(llOTw?iW))aIKs)1 zuw0D&;65ZM+3AtYWHX4VklqmJqTvkKM5j5s7CG%FiEU?s|?ivnira z9~BX)a7{fTIoA{X;Zg91?;|m|1e+c@m?ya>x_*QZ^ARQHt^Z7!80rnliJ`tgPvV0m zkL>v$ATi?kd-137F}n#lTOT7zIItyUa;PUHCkOpNIOh&lU9zwK5Xnh=)E#pKh`E}1 zjS_R$HXtUtJtvjfUO>jc@2TREbg-(E-MLjOIhKw&MfkjH8)!y_gJsb`eld`LR5+#8gy2e~2By6E1yu4E+AqDQJ&C|2oU~MY9;t2#Toae?o zydN$6Fa>?jZV_|}+)*o6=PqXHaFiM>iDh5^DUu+ZrpD|bB%GlnM4p-y3DmaLNuc&k zk2`~vxa`TFY3DAJ9C^kZB1EkG3DNTF&rXU6YSHRMpj8VejlmLNcGEKxwEPw!VFxAQ zacbN2FeX?n%^rFY zF^Stb3sLIJI+nU=Xa=W_Q1g>y&f@U}>fyxSlP zHlXKZz_NXI$2(E92MO986zwBtM6~j*Kpa}?C&&k^{bvvVF$(ZF0r)Bfc;-(cKvh1d zJE-x2{y_PFZ3gTbKtRNIfuOxW(Vjdv3ABps`Z#XU?&mS|JQUc|z;6FDX;{Jfe7ZL( z{!A2R?7z@N*1{Qg(C6Fzmr0PM*shNw30;6n0yb8#Z=8=xViQ4o5YWP?-b?QS+61ll z7(n?cC$Kkzeg8rf++G6iZ3=GfUqx{8)~XB~wMtowOkO6&0c-(b_q`WId6b~M=cj~& zO@9+n%3I5O7;5xU(OK;8ZxE_L!Ul(Hdd13cD776Kh;?5LN#Y zMft|ZDWg8BTeizZT&4v3Wg<7#~tiUz3QhH_w?73nO+zZGh=XNmL z2oVuV#MW<3iU`HFu>ujIZNgSB_SLZ{{|5=`Qxx@~2Sn8JreO@^gf=l&d8;4Lu@_cG z;T=UFrlVHd5sTi1r6t1lfb%jdFdIAl(BxKwJVZaE>x~!Z%~7LQEHxwL!X~aERk> zA<8bDXKu3b##s2%d9Ikj}LN)Hc2sN|rT3 z+F^zA7QU0?N`|W8CHN&txaFacqs)W&Eb#tP9v&4BRYAJh0*{9I3Pe;1K$|U4-o*ou 
z;vrbR!gm%x`5YdoF5=4JQ9%KeZ{>kM=MWS!b;2FsgD=NWDSYRdp?r`7e)u6Lz^gVx zsR|DG;f6|}e1RG6mT*9SVVNDvo1pwn9MEqm1DSILDC;H;=`TZ3<)NtZIbhxDFN9k^ zilU1H`Z>Q9kmMn$C4hk|E(22dW+Y{lbD;bHl_|slyhc=$6V+vEMh#myC~5;hfDgZH z&cY>2hM^#EfnV_9%_s=?i(g0sS72^ivLv6&g>U+ZypIno$P@kW%{PK6p-wrgcd)rT zyS>GdKU!UtJ=A2aY;#oRcQ@Tw6t4ET+A3~zjWkv*?eUD{a+59JmkPZOH&D8yAUVBe z|D@^7YMvauP`=n{PeHyNevq#%cQC1ZJ9A4XDc@|{Kv151|Vq2Yq-Mn7f4 z1PP5g)c#OkW+yO7AI0oZioclXe=_*`9gCC&gZC}rzsct z9dLS*>N6Ytb<+9-ANvgYbbuO*oJ~pTDMNivo}OWlckfcLUn?<@#QqsT__-b3AU|S6 z)L!7FG6zX#4}iSzbsF8c!W8_?)jd3j^&$M z59E*(hyr|G)jz$VrW}a#P{&QWd|^jV3ielx_TTMP+=To9$j>4_c&EVKt1b_|6Z!Qz zp=wf;hiL@$3q*Vdc@Te`ggmgk*^y8ld=UBhZ1NZddGO{w1$ls>!H1YGvG*b5*Wm=w zQ+bf~1(e{qb}VG7#^)G$>4)luq*dCdKmga{ZQRbk{+xF;X{O`;vYp# zJ+3Zq4%GYT*YI&VgCJk>s?h#{`jY?@>rYDu=sUl98Q3S`4EtQ zI5~Q-9)-{3^hf+0BI9U(BKek}{d`XOLbn({)yu%%tmHPazTvk-mMj_V0HG@DyZ4_! z57v|LnVft_seFS@P@yu|pd9f&Vcdwxi%Jr8yDB+HwIQlU2s>CHDMr`JjBc~bZx z228em)=diWOD^9^w3lpmIH`OK&1mnF!e22-`BwN%xk#@CjYjNWNyaD473iM{#~UmP z;}g(UUg(5Tg1Gockcjb*??8Q3-vpvajGxOw{Z9%%_~c~Ew>fVDdPzWv#cOH*x!XAk z&+~u*rTUCl=)dO;bN0~zvQiK9}I1(ElU#Z~*Zu z)L&BnjO*`~J3;Aj`XfY(`m=U`{pR3rNr%BU;^=w}$jz6Kj zCF$=t+`#XWf;_Ot#QtdZqrXmCpIA>$TA!qSJ(QO;KJgCJ_vGn;dbrR%tnQyCVozPA zINpomi|bDXP+pKv+n9;_M^b-__NRz{wjxj!|CE$B=b=2*?7NWYdvpZ&BTh^835cTo z?Z!di2b){uX_Ef4#9oE|Hnxis_O}z1Z!c9E&y~fKVmH|T!p=OfFWMfC5(fk6BwcDp zeF4!V1rkrx-t&G!D!U{vCs_s<`3neQv%nV zy#H*!3HAHgjKQKBGKv1gNkS#)snMfdHvrk*=GW*mGV!prZ01 z@eKK={iR|z%0tK$Y@4K2{7dY=;kYe)fLrD9UWt9j^>^*&EWj_X?`emSm@<-LE<^jy zBK#r}GBJgJqy29#7xhi>W3F0%L6IIZ?hEl2-&bj`WQqN6+TZ01@)hEX2SG{t2VS(_ z9CxQ~78Z2pAfQYidV zFsdZ&Ya7uQYDL*zUD@D3<_>7I#rYRS5c4m-8tu1A1z(GZ59XP?gPt6ibL-BZFvrT_ z95%-%eqQWOfgUsQr(qR(a@FV|hj@+P~VPH91kTzQ|#}!iM=MlN9t3R z=Y^w4PvY@|q7L*QGF}1m+TBgr`Bc`@IZ)0tv=IEqibf>%PaQ6jCaOmwxGbbMpYWd; zpL0|NAMv9k^pNMx3&Fk;w~x8pe5|k0Ypm3!qP$7wG_n52@mJznG5MRxOOpDo&){9lUpywD9p#Yyzb5q*%Z z*984e=%doQN{xO}H<>$@DcCpBzc{|lSEIdDmBP?T68ml;_OBujT9>HNPec<`!8ei$ 
zKAdCZ%$*CtKh@;Vqx}rIq3e@G{w6c(M}>Y`cc{sq)D@PF&6eTA~oIF1Ep8yk6xw-;4O<@rt}8sV@R(f4Lpt zYE<8}JRpwt7xM7AyuZL>)i}|<#6B0)8*0x>MiHMpKG6C$)?PH{p}j9D1=CP$pYxq) zFEps2q^RxV8b_HwihW3^H`Jb2AWlVmrrb&6gL4UYf@G+74z!Ji>W=-9eEH9?b;6w8E z04JFrG?C}9J{jHJpjc&yVvaAnFhu$`GGNWe19Z zMZ--umh=I=F;rNS&Osa>_hnBnEwy|Yij%&O)2Nf{(m>IEcXBa literal 13171 zcmd5@eP~oy7Jo09NyeCL#%=EH#Kt9cbsGGf8I7ITRP!crUd*OxGHMbBX~}nDXOiq> zqQ)(y6p>P-EK-(IiY%qbB1M)W;$NjmDYAbQS&E31BBk^X|0qjYWRX($oO9ouxtY0d zk{I3XlX7vfumhx064KT229Iq{jH)o3!nHq7h^tQ?8-mR52dEE3uKj9o(*2C zME*dUX8=+<*+x|g*}BQoYaKcipf^eZIU;bd`xRf#2NmN}s0G>IX_K_~OgLV>uMLZ7z<1jHj%m;%%tZm%`RKPC9b{t~?CwnSbg~%R=IEZXQXBghAHQtA{A&gc7 zeTswcR?F~?*5>%XIzDx3Y?SO$*a?yY{F=#62hY@yKU4{MtqPKOE5Xa;~s*P^2lz|h~cyU}6Ra|A0b<_?SBp;i>D~!CS$^%e;H(CENsBR)m zAO%FUidi5LpUQ9lR9pG*VQp_4eyJEdHGZRFgWvEb>kQM}*DB*j1vOs?Y92r}@n1|1 zpuT-t0fpQtgQ9Fpac-A~c7_A++5vMKD4d`pQfV~oz@UM)tcbPaGnw>I7E&iOmwnYf z4B~v?51t9Wy`DlN&j<|QvJL^ufM*-L;qkX(L;cA#VT(k?gJ+Wlfy&nr_hB$pBfn$vt7hVeCc zEC^@K;H?P4b2bxz>IoGDJrGe5T*j1=@K!VVv7jXU&1A0}W5J~s|4LZ=E4$ku2&gJx zF-JyF6l&@W&xU|$noY0N8P(V+%Yq(~s0%J*05{0*oBR_qc>8$;E^vjXeqC9;IDLvg z5!m2!**%IAAXUM)vfHeOqX7P72auZ<{OrPx!%7EKx6T-FqVta;01qF~jenHcB*2s% zizNCSz!+^nh~-JnLp^Y>4S#Set#sptxjukfSEZ9cXOs>)L;x|FTfqptkY_a;bKghUQcU zgf&++c2JeN>eZu(PuLR&$=q?dRDi z)XzNiKeu*zGBYz-wik~3n+^4fXtUAXfNm~>?%b7*(F}9)cQSa;@5YmOvnL>SXk~&A z+|x*UWShTbS=0*|C2mJ#uWxf%K;CGesN8jfix_8Ik{c+BPNjII09$y}D0fN?BS+;( zNXxBO15~+iqA-MVe6x_9?y&2EwRoxndoe0Hy=qaRAr(|KG0#FlW^2h&DRmOUSH9uk zrvY9-G%vhcc!-ns$JyF_C7Hg|*+wH8o###>fM|tTd825@6IxBty87u$uyePzmYt1u z*N(V*KtElWplCC#IzzvhlQ&9kF3 z6_@!2_tiJl|GXiXX^;o%WnR4VcRE-dzq*(0FoK_#HOM1OuK2iLB_xo8LvUTSA(z zL0;O*qEAFF*Vz0%lK$rfG|$~NQet$a*VFvm0bp~n71{h{Eu6WRcNNV)Lb9)0(M+FJ z=YT8A3t?AooB1`QxK9+n)JS(ehWlgw6*7MRMbzLezhtZy^OD!X{Cgz-lF0x3+ngPX z*C$%hi@QNj*XvyV-~bEof4|7K7{N<*Io^fK7I7}@B)*#4S$;kaJTL4+%ircwUG)}o zrB|!`5_0@}9~rP-$8n|iqWn5?{E;|5STFSs6fTq$BKq4-ekTrI`EWm4?q-9;q+cmY zUSjfpA)_z%qjC>^Cj<3KT|Dx$HaeW7U4J&GxOT-_@(Ph(KxQAcu?XbY?IrKLRjT^#=brF?8r3~!aT3KVSzWXNgm 
z*7z?rM9M!c47oeoBv6-iFduW8D>MFHCvf_iz?|Kgi@6f}1*!lg#sRjx?$%s$@k^bc z{G}L~IGj^nzalSvUB#~>yW26g+X#NMOR|gV**((L6TgEJ&b4FYUECv0?IK|n_eT6D zlykWq4RfYt2|25{@Zs+q1UcWJ9RA>0?hE?(^$m4+C=XOyiNwnTd*?L0HPH^_A?r2l?6R!(qaefDX^mKxJ<*Bd zF_?eKItrwgjgw-)-?omxs}F#+??%?c7K(yFD>i~~IuMrK7lUya#UXfJJ`JF^!hmIs z<10qEfsiyzpkDC6OZJoA^m-okJx%@evT-KL!1=KQfb9t4C>?_nT{(tN>s!fNkB zV@_Cc?6s38M$+NQ(Rj=0pE5Yrgcn4Q61K$zwZ42hlNn!!qk78Uv zkf5koTIw+p=|4UkX?|@ml#7Ey{|kJ5psTPSnl82*UTFLk*^iP&1UuP}LLQ9cX1}du z4=}+5it2;ZMxi;qi{n3%oE@|Ku_#2)U%;9kpa-L_h@SlYGE%E}QGN-$E$HtBA<`j* z+SG?EmWrY{UaD#(@P-gC%@_MP zXugKH4=(Bd(|P-@V!iCu+h_EMh>sS^Z_I@M=YEWKiFRrp{5XDSpgBDpZV9XVFYUP4 zZ;?M0{8VZ6{-CM!dffX-4Zcp7{S~=#Na)x#oKBt?ggQiccF}%d23-Nc3I4#B%X&px z;%n9WDTA>B^E%h zX@sKR4`L|9W`z$#>>Wwfj<5ET4KJ)FH}mbToliEMs$m39O=zKrHn$c0O~ z_CK(Xv@9py?O{7k@j-Yalz+%cgfKwFVmdT8KOC^-nr zhKR?)^)iMq!B}v1p~DlLGf4$!Gt^PIUeYdSWUzm7*iMDUocLP>ezNCL!q_zr=ff1* zkLy6~~neJam4m2N6;y3BvLiKSFg^2n>dV>B-pRZSehfHdx@H~{?zd7-l zwEO2r?G*Tl{P(!eJG6Bev0F|L4JKxXBb4r|w=af>-7_ -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##FILTER= +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= ##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##SAMPLE= -##SAMPLE= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT first second -chr1 100 . A T 50 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_RString=ref,alt;Info_G=1,2,3;Info_u=0,1,2,3,4 GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 1/1:44:1,2,3,5:testA:r,a:1,2,3:0,2,5,6,1 -chr1 200 . 
C G,CG 60 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42,43;Info_RChar=r,a,A;Info_RString=ref,alt1,alt2;Info_G=1,2,3,4,5,6;Info_u=1,6,3,4,5 GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:2,4,6,8:testA1,testA2:R,A,B:1,2,3,4,5,6:0,2,4 1/2:45:2,1,6,8:testB1,testB2:R,a,b:1,2,3,4,5,6:0,2,4,5,6 -chr2 300 . G A 70 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_RString=ref,alt;Info_G=1,2,3;Info_u=0,1,2,3,4;Flag GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 \ No newline at end of file +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_0 sample_1 +YAR028W 509242864 . a ATg 6 Filter_0 info_Flag_0;info_Integer_1=-1867486102;info_Integer_2=1180908493,1041698941;info_Integer_A=-207506013;info_Integer_R=-1221871784,-1356802777;info_Integer_G=-496257853,2127853583;info_Integer_.=2082620030,-344161839,-1022296779,-1007334133;info_Float_1=68.286865;info_Float_2=-96.154594,-23.433853;info_Float_A=-48.782158;info_Float_R=-46.15216,-92.639305;info_Float_G=-7.5115204,74.78337;info_Float_.=26.825455;Flag_0info_Character_1=i;info_Character_2=r,[;info_Character_A=g;info_Character_R=M,D;info_Character_G=h,w;info_Character_.=G;info_String_1=p]ZoXMTgQo;info_String_2=uVGn`JweVD,DUYytzAny[;info_String_A=_POshsqbSj;info_String_R=AdbZcRFrrQ,_[VS^RtSvz;info_String_G=MeTjonYVIn,jLIi`oWogn;info_String_.=CzkT\Wk_sG 
GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1876597949:706761235,-251943823:394859496:-1947058767,424473864:1331697703,-73747609,1645597043,-1553292366,-1685240226:300184417:18.381859:55.763123,-25.909782:-23.853012:-65.84661,-26.444412:12.577988,-87.76228,-3.4822464,-95.66553,55.56636:-35.16729,6.755356:H:Y,N:m:[,Q:B,C,g,L,`:c,x:xXYm`NnOG[:K`QKgogYxZ,uNAMyDqpgZ:liSmUzRvGG:XBgqxa[aBw,_ZxxkAFA[o:`OIdJgjZDS,tKauvtaIhw,mmrIgNXcbh,Rd]QWyFOgu,kSjBlBKigq:znOIm[gGXi,[j\RlwOmAi 1/1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +23 1165400956 . T t 199 . info_Integer_1=-597222189;info_Integer_2=446843965,1432841503;info_Integer_A=-1756403175;info_Integer_R=-1210584642,1067164582;info_Integer_G=-2026752623,1524204480,2063402043;info_Integer_.=387204105,-2048329790;info_Float_1=29.60765;info_Float_2=-70.24462,91.82048;info_Float_A=-57.780792;info_Float_R=-19.511703,87.46164;info_Float_G=17.362617,-10.059616,-89.640594;info_Float_.=82.884,-31.403328,-83.54941,-54.887726;Flag_0info_Character_1=];info_Character_2=Y,R;info_Character_A=p;info_Character_R=A,W;info_Character_G=i,y,I;info_Character_.=w;info_String_1=]gp_[s]vDh;info_String_2=Y\SmynkIV^,tOuGkqHsiE;info_String_A=QDdbnppEhM;info_String_R=VgQkWCCgEH,r^aAgT^sOf;info_String_G=z\_iwMGBRH,EQy^RJwkWd,gu]hpIwaVj;info_String_.=zxs[sGNNuy,cmnjXNUPka,QaFrhEZaIB,_TjXJMdWCM 
GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-871261733:-1500509753,2025272017:1864754769:-1127684339,-447878996:-1851298122,1367475939,1988967275,-439362500,-447904679:-907504720:-73.56331:-86.60319,-31.910011:6.785797:-95.413086,19.286415:9.942863,-23.623634,31.06224,42.57071,92.734314:-51.402973,-25.126984,73.030045:J:K,Y:K:c,^:y,P,V,b,S:U,w,M,X:IKZ]ZMDszw:apRf\BVTcU,UOJHFcgkaj:fIjt]RZCsd:TtoRPBHoRS,sDF^wkt`MK:boQ]OQxmec,eJfBqcdaUg,To]BkSYKbI,J\qQxtjZBq,\nQWJTeYEf:tHujPdNde\,EWFfR`mig_ 1/1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +ENA|LT795502|LT795502.1 525786811 . A ATgA,CC 226 . info_Integer_1=1004917273;info_Integer_2=-1087925856,-1111801609;info_Integer_A=1142924498,51351;info_Integer_R=397636772,575245484,131531;info_Integer_G=631457844,1508219739,2060178753,1314135;info_Integer_.=915360277;info_Float_1=81.456085;info_Float_2=-97.36381,99.07503;info_Float_A=-17.968132,-30.1514;info_Float_R=23.030853,17.895386,16.51314;info_Float_G=36.786133,-36.816742,-79.92742,304.151351;info_Float_.=-56.670807,-88.687706;info_Character_1=x;info_Character_2=f,X;info_Character_A=Q,R;info_Character_R=M,v,p;info_Character_G=A,B,I,K;info_Character_.=s,S,a,E;info_String_1=RULoNvUdVj;info_String_2=YlKPytYpDY,hwIe\Lokil;info_String_A=\VZSHlparH,bonjour;info_String_R=PVoBxilKPl,_s`t`swTzf,il en faut peu pour ĂȘtre heureux;info_String_G=FiINYvUJIO,LtzFxYFFJp,mMeZaQtSZU,rFHUKkG]FO;info_String_.=hoXwhfvniY,A\txGAVbNp,SbcLeVnkYI 
GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1992801692:305092475,-1179215612:-1739296736,1531141442:-388696167,-659498070,13514131:1503808064,-183677921,-849522112:-498574773:14.566895:10.018013,-16.879608:99.71886,31.141592:0.29120636,-9.131622,-2.71828:80.328705,-26.403976,-37.213943:58.0925,-70.47134,-59.34813:X:N,q:S,M:a,_,u:c,W,U:m:^QkYoWratI:ryX^JlAyCx,WGZntpNsOo:FRryqZFoMj,fromage:gCO^BOI[ml,VJqiy[VWym,ah bah non:nbtqw^\zmA,ZiBBJm[Vbv,aNMll`xnfr:rHzB`UssLW,apsPd_lrip,Uih`ROsUql,tnBQQdhtwm 1|2:.:.:.:.:1,2,3,4,5,6:.:.:.:.:.:1.4142,3.141592,2.71828,1.618,273.15,6.67:.:.:.:.:.:a,b,c,d,e,f:.:.:.:.:.:AA,BB,CC,DD,EEe,FF:. +ENA|LT795502|LT795502.1 1506498921 . 
A gT 99 Filter_0 info_Integer_1=1074860489;info_Integer_2=-6784655,1952022752;info_Integer_A=-1765522773;info_Integer_R=1316333577,-554518728;info_Integer_G=-440746192,417172829,1208578807;info_Integer_.=-67150747,-701563860,1708267257;info_Float_1=-85.47166;info_Float_2=33.09308,37.761444;info_Float_A=99.544266;info_Float_R=-4.276779,27.070168;info_Float_G=-93.02027,-68.755196,-18.597626;info_Float_.=45.63858;info_Character_1=R;info_Character_2=B,W;info_Character_A=l;info_Character_R=z,E;info_Character_G=Z,h,F,D,D;info_Character_.=n,G,v;info_String_1=ojZkSfujYX;info_String_2=`ZrZJtq_hx,StcGnLjWNS;info_String_A=k[feQ[mqyE;info_String_R=Grr[rGo^md,GkiXanc\K\;info_String_G=Yhij\pOPji,yYlsCnJSCY,VggsEuC]ad,G^jiYbvsbn,IJmJvG`jzs;info_String_.=CKtxVQFr]_,G^jAnQaGyI,yvzleXG`vO GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-2039921838:1260782784,-2144365597:1110295788:-158846729,1495837063:390766793,1114219927,-790406568,1652554877,-1144133980:-610126444,1736640977:0.7073364:-56.6103,13.725807:-8.711052:-35.14733,55.534668:-41.20214,-69.47873,15.234543,1.8139114,-81.88782:-86.727234:]:f,Q:I:u,_:b,k,C,[,T:b,S,O:hGS[GPZUZu:uQuJZtwYq_,SyDvF`v_[[:ol]TtBXxVP:mFMNUVM`Ir,XbBPeoBkYj:`xGS_`zgey,v\]bxaFPdJ,lGqnBWyHQI,ynpDFGuSsm,^bGxsDdgIl:q`HG\bwqSl,GrnuUSzgVy,cRSPjljk_Q,TWbW]MISyd 0|1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +chrMT 900574305 . 
c a 208 Filter_0 info_Integer_1=-523627641;info_Integer_2=828853617,538841733;info_Integer_A=-1070289656;info_Integer_R=1177092376,1248528320;info_Integer_G=1338006213,-939491184,2031520519;info_Integer_.=1020802949,-325766450,-1975174725;info_Float_1=86.21178;info_Float_2=55.274292,57.992126;info_Float_A=98.6465;info_Float_R=-56.676746,-78.452255;info_Float_G=-15.1058655,-32.05681,-23.85817;info_Float_.=-8.955811,49.62912,72.4005;Flag_0info_Character_1=x;info_Character_2=B,r;info_Character_A=i;info_Character_R=\,D;info_Character_G=g,v,l,W,T;info_Character_.=g;info_String_1=j`zU\K`PLc;info_String_2=l_rhk[Kr]b,rFB_aSUBR`;info_String_A=ZQYIsGAof_;info_String_R=HwrUBliWel,scCvWxgE[r;info_String_G=kY`HMgmH_p,IUvLZ]sdba,nVQJ_Fh`ET,QnYUFz`ShS,KXwJfcYmsw;info_String_.=Q]wexXkHyr,fL\NGDMlkW,jVpWnME[tp,Zz\hrFyx`] GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1648034443:845207498,1438334805:-821666363:-532302872,-784878946:-1660896800,2008926111,1279825538,-233248668,-1578146061:-1728381686,-1354962949,-2095339305:4.250908:1.8742523,-78.206894:-22.634148:-61.518906,13.15873:-64.15179,17.086502,-25.609276,57.059555,-92.1911:26.885536,-27.22528,-51.00875:F:l,e:M:y,e:^,X,g,x,h:K:LXIC\KLZsD:Nd[vIMiHJA,^ConDldYtT:CBqgBJnzRq:nOgdeNbqKd,rTYmYwJcQu:WMlQs]gO[a,Kk[sJ[UoxT,HT]XWH^ZTF,IIJXSmrLHg,qo`OJ_hgav:tSEKSQWXRL 1/1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +X 508903144 . 
T taAG 107 Filter_1 info_Integer_1=-442012684;info_Integer_2=1242798393,-893635990;info_Integer_A=-1049853993;info_Integer_R=242988245,-245551581;info_Integer_G=992362638,-556141956,-1436766801;info_Integer_.=-1890267838;info_Float_1=50.181763;info_Float_2=5.1533203,32.221054;info_Float_A=31.930801;info_Float_R=18.487122,-4.3887863;info_Float_G=-98.10066,-69.57614,91.27092;info_Float_.=-46.815468,-65.40532;info_Character_1=_;info_Character_2=C,c;info_Character_A=w;info_Character_R=v,g;info_Character_G=v,Q,r,`,W;info_Character_.=\,j,W;info_String_1=Ol_Gd[f\tt;info_String_2=DX_Nqhsbft,ZvYZmhftHw;info_String_A=bXt^\wzwfQ;info_String_R=ANOP[Zjcef,[mLDzYe^Xa;info_String_G=i_XoxRH]Up,N\qKskBEfm,vceQjVrtTu,_LnQ_[ngn],yd[ZFNmECq;info_String_.=FPpMF]TI[C,skPWfxtNBS,uEV]cCazM[ GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1784107859:-939663,-275551415:1120891123:-769476243,636006815:-1200543946,215752478,-1484326861,-561668774,-176043456:1567908440,-504633662,-461948338:41.60312:89.26175,-28.585121:-25.16079:-93.47296,-97.84482:-64.203835,62.569122,-16.536377,24.606033,-83.769844:-0.48690033,50.683716,-61.86049:u:p,x:G:O,q:J,g,s,K,g:c,l,a,w:mgLADffOAW:OFYzsNT]DN,knpK_\ZlLw:NIGi]zaPz\:vcAptCt\VT,z[VAZFjS[p:tcGuXLiEpv,fCLE[^Bzbh,rkGwaRnhxi,fTMvHtFRwN,RPYaXfd^BF:DOnEKUN[]V,ppBNNWrhhK ./.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +23 2057099842 . g a 105 . 
info_String_1=ukwTtXgA^` GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:883085881:-1163194169,396734570:-864830302:-1015935718,416512239:1055659899,378650980,1719415308:1562372336,38255854:-96.96286:-22.160027,2.9108505:-76.4101:-31.626724,74.70122:-10.465622,92.86656,66.5076:-35.273743,71.3008,-21.861176:N:Z,x:w:D,Q:g,[,i:b,u,q:rOrdrrnAAf:fkbpBSUynu,ks[PFA_asT:JK\[phzgTx:sIMyglMmy[,zdTHmmDXr`:mJZqxRqayy,nzIi[BVIeR,TJX_tcRRuR:urxfXYFYWh,yxg_KbMfiV,O^NfHdYiTT,RQLQTrA`[D 1/0:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +NC_000015.10 278483743 . c Gc 68 Filter_0 info_Integer_1=-1771300013;info_Integer_2=611485162,1796725452;info_Integer_A=971438374;info_Integer_R=698255143,905472298;info_Integer_G=-200904731,1733482657,-1601571925;info_Integer_.=1226331110,1800309665;info_Float_1=-8.41539;info_Float_2=-91.844246,33.56476;info_Float_A=-70.37154;info_Float_R=-88.31048,49.067856;info_Float_G=-56.826736,-72.017075,58.757156;info_Float_.=-93.33689,-11.933228;info_Character_1=V;info_Character_2=F,H;info_Character_A=];info_Character_R=],k;info_Character_G=b,j,c,v,k;info_Character_.=W,J,`;info_String_1=[sg[NfQUjS;info_String_2=tuIDx]qY`n,sYihzjCcDX;info_String_A=rWeIJoLqif;info_String_R=Lc\nOEn`SI,ohc]_`UFau;info_String_G=ts\z[cGhVY,FWhsZospCl,EibJWC`AtQ,mNRPCdUKvw,lVzUvcofUf;info_String_.=ivsywIpK\E,dw[sIibpcF,ngkSonUgLJ,OXEnvoSKPb 
GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:1053018922:-184938975,-1760026633:-857964358:-2102716665,1815665606:-1175872690,-347280558,1231790968,-790356303,-222139945:-1345714777,160079922:-82.493515:64.684265,-99.60785:-19.693016:-3.5498886,-99.20609:-5.4513702,-81.74608,-6.946541,-87.49165,-31.873795:-0.010063171:x:n,^:R:b,h:h,c,],\,X:I:LWSccGJLM_:BFr_AeHgBF,HkaUOy`jqz:FGMQKwoeFA:^WCLcfxttz,YAipFMM\Sa:WfUeg^ehSy,Rn^uqvYSmS,sEKSkLrC\p,QDmf[JGzbG,QiZIx`^pZN:Kmi[\ChDrN,OJdBvVq[Af 0/1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +NC_016845.1 1273217582 . A cA 185 Filter_1 info_Integer_1=1702504238;info_Integer_2=-1300020074,-1771363986;info_Integer_A=-666582393;info_Integer_R=-1483769984,-1241578554;info_Integer_G=1976807172,-1260807615,-108510257;info_Integer_.=1829528682,-928482172,-429726805,-2007283327;info_Float_1=70.20416;info_Float_2=94.22778,49.014664;info_Float_A=77.67261;info_Float_R=69.01376,-85.50122;info_Float_G=22.049858,-31.612656,67.47859;info_Float_.=-79.06835,36.144714,-11.66687,-33.392593;Flag_0info_Character_1=Y;info_Character_2=s,w;info_Character_A=Y;info_Character_R=E,S;info_Character_G=J,C,C,^,N;info_Character_.=s;info_String_1=RnqfrhRxGK;info_String_2=QCczZsqSMX,UadliszTvD;info_String_A=UeAmTxgIJs;info_String_R=RpC[yfli[m,UMXScERIAT;info_String_G=Piyg[YSyn],wrVNLOsrsd,CuaYlZzSG`,hHfxMnZBYb,fELmQbwhQV;info_String_.=MHfN_MSgEe,_REWJxavTD,SkkKdTmDLG,R[oCTWMP\K 
GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1879771452:-2024174827,895533931:25454566:-1413372025,-1730678484:1651476894,522323445,-801323168,-692607812,1081910993:-1359195475:64.78084:98.23514,-95.097374:97.43535:-65.953636,7.431885:74.098724,-84.2887,-56.68762,-86.44216,96.5443:32.076004:K:G,\:G:W,z:h,o,j,X,Y:e,G,x:nbjlHj^`q]:aa^uI^^SoQ,JyJ\ARpaJg:MniIYiZryL:qDr]fJV]eR,iuDxSPv[oY:qLjrYY]bPA,rejiDo^By[,`mxXgnjkPa,_grwZxX`kA,\OLO_zFEeT:xJR]YarNNn 0/1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +ENA|LT795502|LT795502.1 566884162 . t c 22 . info_Integer_1=-63306296;info_Integer_2=1391506844,-1503768112;info_Integer_A=340548256;info_Integer_R=-1286314818,288781403;info_Integer_G=-800469678,-1311787939,-793948174;info_Integer_.=-1341990003;info_Float_1=-76.227356;info_Float_2=-54.977512,-39.39898;info_Float_A=-35.61332;info_Float_R=-70.32056,-42.79394;info_Float_G=67.78093,43.006317,92.26671;info_Float_.=-60.336803,-45.87288,92.96947,-43.244385;info_Character_1=[;info_Character_2=c,J;info_Character_A=R;info_Character_R=o,d;info_Character_G=h,`,F,\,q;info_Character_.=T;info_String_1=q^HZe_mW_C;info_String_2=FPSDvSVXAd,YbrjDSdRXm;info_String_A=IxDTHZYoq[;info_String_R=OsOWlbXzO\,hAhG_b\Ifw;info_String_G=jb^GYiHZRT,_[`_aqmUIf,PtWWNPUINQ,WkqQaaxSee,jRMUC_IYwu;info_String_.=ZVqn\yRJEI,`vlpPiWkLZ,aVHocDfVJv 
format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 389250658:-1173892904,-995837010:380428736:-350796083,-1946061625:-1985077526,-956832721:212134446:-93.3871:73.895645,-82.49681:-59.703255:-53.21877,-11.0794525:98.62854,-40.406464,36.850067:-80.0885,25.734207,92.746826,24.650955:Q:l,w:j:t,Z:K,X:i:VAXYF^LWPG:SudBRfeYRI,axYzALsh[m:gWvHMgghOt:cIIIEUOOnN,Q`yNRLvwIx:HeiQgtTGFY,A[RlKUJYGM:u[WuJ]OfAC,ToajkjZMqO .:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. \ No newline at end of file diff --git a/tests/functional.rs b/tests/functional.rs index 6114244..f212c1e 100644 --- a/tests/functional.rs +++ b/tests/functional.rs @@ -25,13 +25,22 @@ Commands: help Print this message or the help of the given subcommand(s) Options: - -i, --input Input path - -b, --batch-size Batch size (default 100,000) - -c, --compression Compression method (default snappy) [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd] - -r, --read-buffer Read buffer size in bytes (default 8192) - -I, --info-optional All information fields are optional - -h, --help Print help (see more with '--help') - -V, --version Print version + -i, --input + Input path + -b, --batch-size + Batch size (default 100,000) + -c, --compression + Compression method (default snappy) [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd] + -r, --read-buffer + Read buffer size in bytes (default 8192) + -I, --info-optional + All information fields are optional + --parquet-version + [possible values: v1, v2] + -h, --help + Print help (see more with \'--help\') + -V, --version + Print version " } else { b"Convert a vcf in parquet @@ -44,13 +53,22 @@ 
Commands: help Print this message or the help of the given subcommand(s) Options: - -i, --input Input path - -b, --batch-size Batch size (default 100,000) - -c, --compression Compression method (default snappy) [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd] - -r, --read-buffer Read buffer size in bytes (default 8192) - -I, --info-optional All information fields are optional - -h, --help Print help (see more with '--help') - -V, --version Print version + -i, --input + Input path + -b, --batch-size + Batch size (default 100,000) + -c, --compression + Compression method (default snappy) [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd] + -r, --read-buffer + Read buffer size in bytes (default 8192) + -I, --info-optional + All information fields are optional + --parquet-version + [possible values: v1, v2] + -h, --help + Print help (see more with \'--help\') + -V, --version + Print version " }; @@ -70,6 +88,7 @@ fn convert() -> Result<(), assert_cmd::cargo::CargoError> { let parquet_path = temp_path.join("tests.parquet"); cmd.args([ + "-I", "-i", "tests/data/test.vcf", "convert", @@ -107,6 +126,7 @@ fn split() -> Result<(), assert_cmd::cargo::CargoError> { let parquet_path = temp_path.join("test_{}.parquet"); cmd.args([ + "-I", "-i", "tests/data/test.vcf", "split", From abfda05519ddc371c70f6a50063e2efe24416eb2 Mon Sep 17 00:00:00 2001 From: Pierre Marijon Date: Thu, 18 Apr 2024 16:30:12 +0200 Subject: [PATCH 6/8] fix: format and remove debug instruction --- src/name2data.rs | 74 ++++++------------------------------------------ 1 file changed, 9 insertions(+), 65 deletions(-) diff --git a/src/name2data.rs b/src/name2data.rs index d0d0241..defeafb 100644 --- a/src/name2data.rs +++ b/src/name2data.rs @@ -149,11 +149,6 @@ impl Name2Data { *array_val.get(alt_id + 1).unwrap(), ])?; } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); column.push_null(); } } @@ -202,11 +197,6 @@ 
impl Name2Data { *array_val.get(alt_id + 1).unwrap(), ])?; } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); column.push_null(); } } @@ -261,11 +251,6 @@ impl Name2Data { array_val.get(alt_id + 1).unwrap().clone(), ])?; } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); column.push_null(); } } @@ -336,11 +321,6 @@ impl Name2Data { ), ])?; } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); column.push_null(); } } @@ -407,28 +387,13 @@ impl Name2Data { if let Some(format_field) = record.genotypes().get_index(idx) { match format_field.get(key).flatten() { Some(value) => match value { - - noodles::vcf::record::genotypes::sample::Value::Integer( - value, - ) - => { - column.push_i32(Some(*value)); - } - - noodles::vcf::record::genotypes::sample::Value::Float( - value, - - ) => { - column.push_f32(Some(*value)); - } - noodles::vcf::record::genotypes::sample::Value::String( - value, - ) => { + noodles::vcf::record::genotypes::sample::Value::Integer(value) => column.push_i32(Some(*value)), + noodles::vcf::record::genotypes::sample::Value::Float(value) => column.push_f32(Some(*value)), + noodles::vcf::record::genotypes::sample::Value::String(value) => { if key.to_string()=="GT" { - let mut gt_str = String::with_capacity(32);//Arbitrary capacity + let mut gt_str = String::with_capacity(32); //Arbitrary capacity if let Some(gt) = format_field.genotype().and_then(|g|g.ok()) { - eprintln!("GT: {:?} ({:?},{:?})", gt,record.chromosome(),record.position()); gt.iter().enumerate().for_each(|(i,allele)| { let (position, phasing) = (allele.position(), allele.phasing()); match position { @@ -446,10 +411,10 @@ impl Name2Data { } } if i < gt.len() - 1 { - gt_str.push(match phasing { - Phasing::Phased => '|', - Phasing::Unphased => '/', - }); + gt_str.push(match phasing { + Phasing::Phased => '|', + Phasing::Unphased => 
'/', + }); } }); } @@ -510,11 +475,6 @@ impl Name2Data { *array_val.get(alt_id + 1).unwrap(), ])?; } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); column.push_null(); } } @@ -564,11 +524,6 @@ impl Name2Data { *array_val.get(alt_id + 1).unwrap(), ])?; } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); column.push_null(); } } @@ -622,11 +577,6 @@ impl Name2Data { array_val.get(alt_id + 1).unwrap().clone(), ])?; } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); column.push_null(); } } @@ -690,11 +640,6 @@ impl Name2Data { ), ])?; } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); column.push_null(); } } @@ -731,8 +676,7 @@ impl Name2Data { unreachable!("{} should be in schema", key_name); }, } - } - else { + } else { todo!("Understand how we could get there (the tests never did)"); } } From b3b7231d05bda9ac97626c57eff1a767552ceb22 Mon Sep 17 00:00:00 2001 From: Charles Monod-Broca Date: Mon, 22 Apr 2024 11:39:27 +0200 Subject: [PATCH 7/8] Flattened nested options Unified add_info and add_format implementations --- src/name2data.rs | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/name2data.rs b/src/name2data.rs index defeafb..696003a 100644 --- a/src/name2data.rs +++ b/src/name2data.rs @@ -89,26 +89,24 @@ impl Name2Data { let key_name = format!("info_{}", key); let info_def = header.infos().get(key).unwrap(); if let Some(column) = self.0.get_mut(&key_name) { - match info.get(key) { + match info.get(key).flatten() { Some(value) => match value { - Some(noodles::vcf::record::info::field::Value::Flag) => { + noodles::vcf::record::info::field::Value::Flag => { column.push_bool(true); } - Some(noodles::vcf::record::info::field::Value::Integer(value)) => { + 
noodles::vcf::record::info::field::Value::Integer(value) => { column.push_i32(Some(*value)); } - Some(noodles::vcf::record::info::field::Value::Float(value)) => { + noodles::vcf::record::info::field::Value::Float(value) => { column.push_f32(Some(*value)); } - Some(noodles::vcf::record::info::field::Value::String(value)) => { + noodles::vcf::record::info::field::Value::String(value) => { column.push_string(value.to_string()); } - Some(noodles::vcf::record::info::field::Value::Character(value)) => { + noodles::vcf::record::info::field::Value::Character(value) => { column.push_string(value.to_string()); } - Some(noodles::vcf::record::info::field::Value::Array(arr)) => match arr - .clone() - { + noodles::vcf::record::info::field::Value::Array(arr) => match arr.clone() { noodles::vcf::record::info::field::value::Array::Integer(array_val) => { match info_def.number() { noodles::vcf::header::Number::Count(0 | 1) => { @@ -334,11 +332,6 @@ impl Name2Data { } }, }, - None => { - unreachable!( - "Since the outermost option is Some, this should be unreachable" - ); - } }, None => { if info_def.ty() From e381029307baea22739b84d1698647d3ddb17d50 Mon Sep 17 00:00:00 2001 From: Charles Monod-Broca Date: Tue, 9 Jul 2024 18:22:11 +0200 Subject: [PATCH 8/8] WIP: Fixing unwrap bugs... 
Not sure what style to stick to, open to suggestions --- src/name2data.rs | 82 +++++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 28 deletions(-) diff --git a/src/name2data.rs b/src/name2data.rs index 696003a..f997f82 100644 --- a/src/name2data.rs +++ b/src/name2data.rs @@ -119,12 +119,12 @@ impl Name2Data { column.push_veci32(array_val)?; } noodles::vcf::header::Number::A => { - column.push_i32(*array_val.get(alt_id).unwrap()); + column.push_i32(*array_val.get(alt_id).unwrap_or(&None)); } noodles::vcf::header::Number::R => { column.push_veci32(vec![ - *array_val.first().unwrap(), - *array_val.get(alt_id + 1).unwrap(), + *array_val.first().unwrap_or(&None), + *array_val.get(alt_id + 1).unwrap_or(&None), ])?; } noodles::vcf::header::Number::G => { @@ -132,19 +132,19 @@ impl Name2Data { == (allele_count * (allele_count + 1) / 2) { column.push_veci32(vec![ - *array_val.first().unwrap(), + *array_val.first().unwrap_or(&None), *array_val .get((alt_id * alt_id + 3 * alt_id + 2) / 2) - .unwrap(), + .unwrap_or(&None), *array_val .get((alt_id * alt_id + 5 * alt_id + 4) / 2) - .unwrap(), + .unwrap_or(&None), ])?; } else if array_val.len() == allele_count { column.push_veci32(vec![ - *array_val.first().unwrap(), + *array_val.first().unwrap_or(&None), Some(0), - *array_val.get(alt_id + 1).unwrap(), + *array_val.get(alt_id + 1).unwrap_or(&None), ])?; } else { column.push_null(); @@ -167,12 +167,12 @@ impl Name2Data { column.push_vecf32(array_val)?; } noodles::vcf::header::Number::A => { - column.push_f32(*array_val.get(alt_id).unwrap()); + column.push_f32(*array_val.get(alt_id).unwrap_or(&None)); } noodles::vcf::header::Number::R => { column.push_vecf32(vec![ - *array_val.first().unwrap(), - *array_val.get(alt_id + 1).unwrap(), + *array_val.first().unwrap_or(&None), + *array_val.get(alt_id + 1).unwrap_or(&None), ])?; } noodles::vcf::header::Number::G => { @@ -180,19 +180,19 @@ impl Name2Data { == (allele_count * (allele_count + 1) / 2) { 
column.push_vecf32(vec![ - *array_val.first().unwrap(), + *array_val.first().unwrap_or(&None), *array_val .get((alt_id * alt_id + 3 * alt_id + 2) / 2) - .unwrap(), + .unwrap_or(&None), *array_val .get((alt_id * alt_id + 5 * alt_id + 4) / 2) - .unwrap(), + .unwrap_or(&None), ])?; } else if array_val.len() == allele_count { column.push_vecf32(vec![ - *array_val.first().unwrap(), + *array_val.first().unwrap_or(&None), Some(0.), - *array_val.get(alt_id + 1).unwrap(), + *array_val.get(alt_id + 1).unwrap_or(&None), ])?; } else { column.push_null(); @@ -216,14 +216,28 @@ impl Name2Data { } noodles::vcf::header::Number::A => { column.push_string( - array_val.get(alt_id).unwrap().clone().unwrap(), + array_val + .get(alt_id) + .unwrap() + .clone() + .unwrap_or_default(), ); } noodles::vcf::header::Number::R => { column.push_vecstring(vec![ - Some(array_val.first().unwrap().clone().unwrap()), Some( - array_val.get(alt_id + 1).unwrap().clone().unwrap(), + array_val + .first() + .unwrap() + .clone() + .unwrap_or_default(), + ), + Some( + array_val + .get(alt_id + 1) + .unwrap() + .clone() + .unwrap_or_default(), ), ])?; } @@ -232,15 +246,27 @@ impl Name2Data { == (allele_count * (allele_count + 1) / 2) { column.push_vecstring(vec![ - array_val.first().unwrap().clone(), - array_val - .get((alt_id * alt_id + 3 * alt_id + 2) / 2) - .unwrap() - .clone(), - array_val - .get((alt_id * alt_id + 5 * alt_id + 4) / 2) - .unwrap() - .clone(), + Some( + array_val + .first() + .unwrap() + .clone() + .unwrap_or_default(), + ), + Some( + array_val + .get((alt_id * alt_id + 3 * alt_id + 2) / 2) + .unwrap() + .clone() + .unwrap_or_default(), + ), + Some( + array_val + .get((alt_id * alt_id + 5 * alt_id + 4) / 2) + .unwrap() + .clone() + .unwrap_or_default(), + ), ])?; } else if array_val.len() == allele_count { column.push_vecstring(vec![