Skip to content

Commit

Permalink
Deprecate functions for reading VCF
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite committed Jun 27, 2024
1 parent 155fbb7 commit 9b3ca36
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 0 deletions.
3 changes: 3 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ PLINK
VCF (reading)
-------------

.. deprecated:: 0.9.0
   Functions for reading VCF are deprecated; please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package instead.

.. currentmodule:: sgkit.io.vcf
.. autosummary::
   :toctree: generated/
Expand Down
3 changes: 3 additions & 0 deletions docs/vcf.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
Reading VCF
===========

.. deprecated:: 0.9.0
   Functions for reading VCF are deprecated; please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package instead.

.. contents:: Table of contents:
   :local:

Expand Down
11 changes: 11 additions & 0 deletions sgkit/io/vcf/vcf_partition.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from typing import Any, Dict, Optional, Sequence, Union

import dask
Expand Down Expand Up @@ -78,6 +79,9 @@ def partition_into_regions(
"""
Calculate genomic region strings to partition a compressed VCF or BCF file into roughly equal parts.
.. deprecated:: 0.9.0
Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package.
A ``.tbi`` or ``.csi`` file is used to find BGZF boundaries in the compressed VCF file, which are then
used to divide the file into parts.
Expand Down Expand Up @@ -118,6 +122,13 @@ def partition_into_regions(
ValueError
If either of ``num_parts`` or ``target_part_size`` is not a positive integer.
"""

warnings.warn(
"Functions for reading VCF are deprecated, please use the bio2zarr package.",
DeprecationWarning,
stacklevel=2,
)

if num_parts is None and target_part_size is None:
raise ValueError("One of num_parts or target_part_size must be specified")

Expand Down
45 changes: 45 additions & 0 deletions sgkit/io/vcf/vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,9 @@ def vcf_to_zarrs(
) -> Sequence[str]:
"""Convert VCF files to multiple Zarr on-disk stores, one per region.
.. deprecated:: 0.9.0
Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package.
Parameters
----------
input
Expand Down Expand Up @@ -754,6 +757,12 @@ def vcf_to_zarrs(
A list of URLs to the Zarr outputs.
"""

warnings.warn(
"Functions for reading VCF are deprecated, please use the bio2zarr package.",
DeprecationWarning,
stacklevel=2,
)

output_storage_options = output_storage_options or {}

tasks = []
Expand Down Expand Up @@ -798,6 +807,9 @@ def concat_zarrs(
) -> None:
"""Concatenate multiple Zarr stores into a single Zarr store.
.. deprecated:: 0.9.0
Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package.
The Zarr stores are concatenated and rechunked to produce a single combined store.
Parameters
Expand All @@ -814,6 +826,12 @@ def concat_zarrs(
the chunk length of the first input Zarr store is used.
"""

warnings.warn(
"Functions for reading VCF are deprecated, please use the bio2zarr package.",
DeprecationWarning,
stacklevel=2,
)

vars_to_rechunk = []
vars_to_copy = []
storage_options = storage_options or {}
Expand Down Expand Up @@ -856,6 +874,9 @@ def vcf_to_zarr(
) -> None:
"""Convert VCF files to a single Zarr on-disk store.
.. deprecated:: 0.9.0
Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package.
By default, the conversion is carried out in parallel, by writing the output for each
part to a separate, intermediate Zarr store in ``tempdir``. Then, in a second step
the intermediate outputs are concatenated and rechunked into the final output Zarr
Expand Down Expand Up @@ -955,6 +976,12 @@ def vcf_to_zarr(
so for large VCF files this can be slow.
"""

warnings.warn(
"Functions for reading VCF are deprecated, please use the bio2zarr package.",
DeprecationWarning,
stacklevel=2,
)

if temp_chunk_length is not None:
if chunk_length % temp_chunk_length != 0:
raise ValueError(
Expand Down Expand Up @@ -1039,6 +1066,9 @@ def read_vcf(
) -> xr.Dataset:
"""Read VCF dataset.
.. deprecated:: 0.9.0
Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package.
A convenience for :func:`vcf_to_zarr` followed by :func:`sgkit.load_dataset`.
Note that the output Zarr store in ``tempdir`` is not deleted after this function
returns, so must be deleted manually by the user.
Expand Down Expand Up @@ -1119,6 +1149,12 @@ def read_vcf(
"""

warnings.warn(
"Functions for reading VCF are deprecated, please use the bio2zarr package.",
DeprecationWarning,
stacklevel=2,
)

# Need to retain zarr file backing the returned dataset
with temporary_directory(
prefix="read_vcf_",
Expand Down Expand Up @@ -1166,6 +1202,9 @@ def zarr_array_sizes(
) -> Dict[str, Any]:
"""Make a pass through a VCF/BCF file to determine sizes for storage in Zarr.
.. deprecated:: 0.9.0
Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package.
By default, the input is processed in parts in parallel. However, if the input
is a single file, ``target_part_size`` is None, and ``regions`` is None,
then the operation will be carried out sequentially.
Expand All @@ -1188,6 +1227,12 @@ def zarr_array_sizes(
are not None.
"""

warnings.warn(
"Functions for reading VCF are deprecated, please use the bio2zarr package.",
DeprecationWarning,
stacklevel=2,
)

return process_vcfs(
input,
zarr_array_sizes_sequential,
Expand Down
9 changes: 9 additions & 0 deletions sgkit/io/vcfzarr_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ def read_scikit_allel_vcfzarr(
) -> xr.Dataset:
"""Read a VCF Zarr file created using scikit-allel.
.. deprecated:: 0.9.0
Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package.
Loads VCF variant, sample, and genotype data as Dask arrays within a Dataset
from a Zarr file created using scikit-allel's ``vcf_to_zarr`` function.
Expand Down Expand Up @@ -90,6 +93,12 @@ def read_scikit_allel_vcfzarr(
- :data:`sgkit.variables.call_genotype_mask_spec` (variants, samples, ploidy)
"""

warnings.warn(
"Functions for reading VCF are deprecated, please use the bio2zarr package.",
DeprecationWarning,
stacklevel=2,
)

vcfzarr = zarr.open_group(str(path), mode="r")

# don't fix strings since it requires a pass over the whole dataset
Expand Down

0 comments on commit 9b3ca36

Please sign in to comment.