Skip to content

Commit

Permalink
Add readers and writers for GRL
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche committed Jan 23, 2024
1 parent 739f325 commit c322617
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/dolomite_ranges/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,5 @@
from .read_sequence_information import read_sequence_information
from .save_genomic_ranges import save_genomic_ranges
from .read_genomic_ranges import read_genomic_ranges
from .save_genomic_ranges_list import save_genomic_ranges_list
from .read_genomic_ranges_list import read_genomic_ranges_list
74 changes: 74 additions & 0 deletions src/dolomite_ranges/read_genomic_ranges_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import os

import dolomite_base as dl
import h5py
from dolomite_base.read_object import registry
from genomicranges import GenomicRangesList

from .read_genomic_ranges import read_genomic_ranges

registry["genomic_ranges_list"] = "dolomite_ranges.read_genomic_ranges_list"


def read_genomic_ranges_list(path: str, metadata: dict, **kwargs) -> GenomicRangesList:
"""Load genomic ranges into a
:py:class:`~genomicranges.GenomicRanges.GenomicRangesList` object.
This method
should generally not be called directly but instead be invoked by
:py:meth:`~dolomite_base.read_object.read_object`.
Args:
path:
Path to the directory containing the object.
metadata:
Metadata for the object.
kwargs:
Further arguments, ignored.
Returns:
A :py:class:`~genomicranges.GenomicRangesList.GenomicRangesList` object.
"""

with h5py.File(os.path.join(path, "partitions.h5"), "r") as handle:
ghandle = handle["genomic_ranges_list"]

lengths = dl.load_vector_from_hdf5(
ghandle["lengths"], expected_type=int, report_1darray=True
)

names = None
if "names" in ghandle:
names = dl.load_vector_from_hdf5(
ghandle["names"], expected_type=str, report_1darray=True
)

_all_granges = read_genomic_ranges(
path=os.path.join(path, "concatenated"), metadata=None
)

counter = 0
_split_granges = []
if lengths.sum() == 0:
_split_granges = _all_granges
else:
for ilen in lengths:
_frag = _all_granges[counter : (counter + ilen)]
_split_granges.append(_frag)
counter += ilen

grl = GenomicRangesList(names=names, range_lengths=lengths, ranges=_split_granges)

_elem_annotation_path = os.path.join(path, "element_annotations")
if os.path.exists(_elem_annotation_path):
_mcols = dl.read_object(_elem_annotation_path)
grl = grl.set_mcols(_mcols)

_meta_path = os.path.join(path, "other_annotations")
if os.path.exists(_meta_path):
_meta = dl.read_object(_meta_path)
grl = grl.set_metadata(_meta.as_dict())

return grl
60 changes: 60 additions & 0 deletions src/dolomite_ranges/save_genomic_ranges_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import os

import dolomite_base as dl
import h5py
from biocutils import combine_sequences
from genomicranges import GenomicRangesList


@dl.save_object.register
@dl.validate_saves
def save_genomic_ranges_list(x: GenomicRangesList, path: str, **kwargs):
"""Method for saving :py:class:`~genomicranges.GenomicRanges.GenomicRangesList`
objects to their corresponding file representations, see
:py:meth:`~dolomite_base.save_object.save_object` for details.
Args:
x:
Object to be staged.
path:
Path to a directory in which to save ``x``.
kwargs:
Further arguments to be passed to individual methods.
Returns:
`x` is saved to `path`.
"""
os.mkdir(path)

with open(os.path.join(path, "OBJECT"), "w", encoding="utf-8") as handle:
handle.write(
'{ "type": "genomic_ranges_list", "genomic_ranges_list": { "version": "1.0" } }'
)

with h5py.File(os.path.join(path, "partitions.h5"), "w") as handle:
ghandle = handle.create_group("genomic_ranges_list")

dl.write_integer_vector_to_hdf5(
ghandle, name="lengths", h5type="u4", x=x.get_range_lengths()
)

if x.get_names() is not None:
dl.write_string_vector_to_hdf5(ghandle, name="names", x=x.get_names())

_all_ranges = x.get_ranges()
if isinstance(_all_ranges, list) and len(_all_ranges) > 1:
_all_ranges = combine_sequences(*x.get_ranges())

dl.save_object(_all_ranges, path=os.path.join(path, "concatenated"))

_elem_annotation = x.get_mcols()
if _elem_annotation is not None and _elem_annotation.shape[1] > 0:
dl.save_object(_elem_annotation, path=os.path.join(path, "element_annotations"))

_meta = x.get_metadata()
if _meta is not None and len(_meta) > 0:
dl.save_object(_meta, path=os.path.join(path, "other_annotations"))

return
47 changes: 47 additions & 0 deletions tests/test_granges_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
from tempfile import mkdtemp

from biocframe import BiocFrame
from dolomite_base import read_object, save_object
from genomicranges import GenomicRanges, GenomicRangesList
from iranges import IRanges
import dolomite_ranges


def test_genomic_ranges_list():
a = GenomicRanges(
seqnames=["chr1", "chr2", "chr1", "chr3"],
ranges=IRanges([1, 3, 2, 4], [10, 30, 50, 60]),
strand=["-", "+", "*", "+"],
mcols=BiocFrame({"score": [1, 2, 3, 4]}),
)

b = GenomicRanges(
seqnames=["chr2", "chr4", "chr5"],
ranges=IRanges([3, 6, 4], [30, 50, 60]),
strand=["-", "+", "*"],
mcols=BiocFrame({"score": [2, 3, 4]}),
)

grl = GenomicRangesList(ranges=[a, b], names=["a", "b"])

dir = os.path.join(mkdtemp(), "granges")
save_object(grl, dir)

roundtrip = read_object(dir)
assert roundtrip.get_names() == grl.get_names()
assert len(roundtrip.get_ranges()) == len(grl.get_ranges())
assert (roundtrip["a"].get_start() == grl["a"].get_start()).all()
assert (roundtrip["a"].get_strand() == grl["a"].get_strand()).all()
assert (roundtrip.get_range_lengths() == grl.get_range_lengths()).all()

def test_genomic_ranges_list_empty():
grl = GenomicRangesList.empty(n=100)

dir = os.path.join(mkdtemp(), "granges_empty")
save_object(grl, dir)

roundtrip = read_object(dir)
assert roundtrip.get_names() == grl.get_names()
assert len(roundtrip.get_ranges()) == len(grl.get_ranges())
assert (roundtrip.get_range_lengths() == grl.get_range_lengths()).all()

0 comments on commit c322617

Please sign in to comment.