Skip to content

Commit

Permalink
Merge pull request #170 from databio/dev_io
Browse files Browse the repository at this point in the history
Release 0.4.1
  • Loading branch information
khoroshevskyi authored Sep 19, 2024
2 parents 440f979 + 42f3f03 commit 2809e8e
Show file tree
Hide file tree
Showing 13 changed files with 100 additions and 32 deletions.
Binary file not shown.
Binary file removed data/geniml_bb_cache/bedsets/BiocFileCache.sqlite
Binary file not shown.
3 changes: 0 additions & 3 deletions data/geniml_bb_cache/tokens.zarr/.zgroup

This file was deleted.

2 changes: 1 addition & 1 deletion geniml/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.0"
__version__ = "0.4.1"
14 changes: 7 additions & 7 deletions geniml/bbclient/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from logging import getLogger

from .const import MODULE_NAME
from .const import MODULE_NAME, DEFAULT_CACHE_FOLDER

_LOGGER = getLogger(MODULE_NAME)

Expand All @@ -12,7 +12,7 @@ def build_subparser_cache_bed(parser):
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand All @@ -26,7 +26,7 @@ def build_subparser_cache_bedset(parser):
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand All @@ -40,7 +40,7 @@ def build_subparser_seek(parser):
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand All @@ -53,7 +53,7 @@ def build_subparser_inspect(parser):
"""
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand All @@ -75,7 +75,7 @@ def build_subparser_cache_tokens(parser):
)
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand All @@ -89,7 +89,7 @@ def build_subparser_remove(parser):
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand Down
18 changes: 11 additions & 7 deletions geniml/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,18 @@ def main(test_args=None):
)

if args.command == "bbclient":
if args.subcommand is not None:
if args.subcommand in [
"cache-bed",
"cache-tokens",
"cache-bedset",
"seek",
"inspect",
"rm",
]:
_LOGGER.info(f"Subcommand: {args.subcommand}")
from .bbclient import BBClient

bbc = BBClient()
bbc = BBClient(cache_folder=args.cache_folder)

else:
# if no subcommand, print help format of bbclient subparser
Expand All @@ -141,11 +148,8 @@ def main(test_args=None):
if args.subcommand == "cache-bed":
# if input is a BED file path
if os.path.exists(args.identifier[0]):
from .io import RegionSet

bedfile = RegionSet(args.identifier[0])
bbc.add_bed_to_cache(bedfile)
_LOGGER.info(f"BED file {bedfile.compute_bed_identifier()} has been cached")
identifier = bbc.add_bed_to_cache(args.identifier[0])
_LOGGER.info(f"BED file {identifier} has been cached")
else:
bbc.load_bed(args.identifier[0])

Expand Down
41 changes: 36 additions & 5 deletions geniml/io/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,12 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False):
:param regions: path, or url to bed file or list of Region objects
:param backed: whether to load the bed file into memory or not [Default: False]
"""
# load from file
self._df: Union[pd.DataFrame, None] = None

if isinstance(regions, str):
self.backed = backed
self.regions: List[Region] = []
self.path = regions

self.regions = None
self.is_gzipped = False

Expand Down Expand Up @@ -90,6 +90,7 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False):
df = self._read_gzipped_file(regions)
else:
df = self._read_file_pd(regions, sep="\t", header=None, engine="pyarrow")
self._df = df

_regions = []
df.apply(
Expand All @@ -111,6 +112,15 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False):

self._identifier = None

def to_pandas(self) -> Union[pd.DataFrame, None]:
    """Return the regions as a pandas DataFrame, one region per row.

    If a frame was cached while parsing the BED file (``self._df``), it is
    returned as-is. Otherwise the frame is rebuilt from the Region objects,
    using the same one-region-per-row orientation (chrom, start, end) as
    the cached frame.

    :return: DataFrame with one row per region.
    """
    if self._df is not None:
        return self._df

    # NOTE(review): the original code did pd.DataFrame([seqnames, starts, ends]),
    # which produces a transposed (3 x N) frame — rows chr/start/end rather than
    # one region per row — and zip(*[]) raised ValueError on an empty RegionSet.
    # Build row-wise instead, matching self._df's layout; empty input yields an
    # empty DataFrame.
    rows = [(region.chr, region.start, region.end) for region in self]
    return pd.DataFrame(rows)

def _read_gzipped_file(self, file_path: str) -> pd.DataFrame:
"""
Read a gzipped file into a pandas dataframe
Expand Down Expand Up @@ -140,12 +150,33 @@ def _read_file_pd(self, *args, **kwargs) -> pd.DataFrame:
if row_count > 0:
_LOGGER.info(f"Skipped {row_count} rows while standardization. File: '{args}'")
df = df.dropna(axis=1)
return df
for index, row in df.iterrows():
if (
isinstance(row[0], str)
and isinstance(row[1], int)
and isinstance(row[2], int)
):
return df
else:
if isinstance(row[1], str):
try:
_ = int(row[1])
df[1] = pd.to_numeric(df[1])
except ValueError:
row_count += 1
break
if isinstance(row[2], str):
try:
_ = int(row[2])
df[2] = pd.to_numeric(df[2])
except ValueError:
row_count += 1
break
return df
except (pd.errors.ParserError, pd.errors.EmptyDataError) as _:
if row_count <= max_rows:
row_count += 1
# if can't open file after 5 attempts try to open it with gzip
return self._read_gzipped_file(*args)
raise BEDFileReadError("Cannot read bed file.")

def __len__(self):
    """Return ``self.length`` — presumably the number of regions in this
    RegionSet (the attribute is set elsewhere; not visible in this view)."""
    return self.length
Expand Down
7 changes: 7 additions & 0 deletions tests/data/io_data/bed/s1_a_coments.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# THIS is big header
# with 4 lines
# and 3rd line
# is empty
chr1 10 30
chr1 110 130
chr1 210 230
5 changes: 1 addition & 4 deletions tests/data/io_data/bed/s1_a_headers.bed
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
# THIS is big header
# with 4 lines
# and 3rd line
# is empty
chrom_name one two
chr1 10 30
chr1 110 130
chr1 210 230
Empty file.
11 changes: 11 additions & 0 deletions tests/data/io_data/bed_bad/s1_many_headers.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# THIS is big header
# with 4 lines
# and 3rd line
# is empty
# THIS is big header
# with 4 lines
# and 3rd line
# is empty
chr1 10 30
chr1 110 130
chr1 210 230
File renamed without changes.
31 changes: 26 additions & 5 deletions tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os

import genomicranges
import pandas as pd
import pytest

from geniml.io.exceptions import GenimlBaseError
from geniml.io.exceptions import GenimlBaseError, BEDFileReadError
from geniml.io.io import SNP, Maf, Region, RegionSet

DATA_TEST_FOLDER = os.path.join(
Expand All @@ -14,11 +15,15 @@
)
DATA_TEST_FOLDER_BED = os.path.join(DATA_TEST_FOLDER, "bed")
DATA_TEST_FOLDER_MAF = os.path.join(DATA_TEST_FOLDER, "maf")
DATA_TEST_FOLDER_BED_BAD = os.path.join(DATA_TEST_FOLDER, "bed_bad")

ALL_BEDFILE_PATH = [
os.path.join(DATA_TEST_FOLDER_BED, x) for x in os.listdir(DATA_TEST_FOLDER_BED)
]
ALL_MAF_PATH = [os.path.join(DATA_TEST_FOLDER_MAF, x) for x in os.listdir(DATA_TEST_FOLDER_MAF)]
ALL_BADFILE_BAD_PATH = [
os.path.join(DATA_TEST_FOLDER_BED_BAD, x) for x in os.listdir(DATA_TEST_FOLDER_BED_BAD)
]


def test_make_region():
Expand Down Expand Up @@ -51,7 +56,7 @@ class TestRegionSet:
@pytest.mark.parametrize(
"url",
[
"ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7666nnn/GSM7666464/suppl/GSM7666464_18134-282-06_S51_L003_peaks.narrowPeak.gz"
"https://github.com/databio/geniml/raw/master/tests/data/io_data/bed/s1_a.bed.gz",
],
)
def test_region_set_from_url(self, url):
Expand All @@ -70,11 +75,15 @@ def test_region_set_from_path(self, url):
assert isinstance(region, Region)
break

@pytest.mark.parametrize("path", ALL_BADFILE_BAD_PATH)
def test_broken_bed_from_path(self, path):
    """Every file in the bad-BED folder must fail to parse with BEDFileReadError."""
    with pytest.raises(BEDFileReadError):
        # Construction itself must raise; binding the result to a local
        # (as before) only triggered an unused-variable lint warning.
        RegionSet(path)

@pytest.mark.parametrize(
"url",
[
"ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7666nnn/GSM7666464/suppl/GSM7666464_18134-282-06_S51_L003_peaks.narrowPeak.gz"
], # This is not the right way how to do it!
["https://github.com/databio/geniml/raw/master/tests/data/io_data/bed/s1_a.bed.gz"],
# TODO: This is not the right way how to do it!
)
def test_region_set_from_url_cant_be_backed(self, url):
with pytest.raises(GenimlBaseError):
Expand Down Expand Up @@ -105,6 +114,18 @@ def test_calculation_id(self):
assert len(bedfile_id_2) == 32
assert bedfile_id_1 == bedfile_id_2 == bedfile_id_3

@pytest.mark.parametrize("url", ALL_BEDFILE_PATH)
def test_to_df(self, url):
    """An in-memory (non-backed) RegionSet converts to a pandas DataFrame."""
    frame = RegionSet(url, backed=False).to_pandas()
    assert isinstance(frame, pd.DataFrame)

@pytest.mark.parametrize("url", ALL_BEDFILE_PATH)
def test_to_df_backed(self, url):
    """A backed (lazily loaded) RegionSet also converts to a pandas DataFrame."""
    frame = RegionSet(url, backed=True).to_pandas()
    assert isinstance(frame, pd.DataFrame)


class TestMaff:
@pytest.mark.parametrize("path", ALL_MAF_PATH)
Expand Down

0 comments on commit 2809e8e

Please sign in to comment.