From fa9476bb27be3b80d747e63b7700ac5f42f24028 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 18 Sep 2024 14:55:55 -0400 Subject: [PATCH 1/4] improved opening bed files --- geniml/io/io.py | 41 +++++++++++++++--- tests/data/io_data/bed/s1_a_coments.bed | 7 +++ tests/data/io_data/bed/s1_a_headers.bed | 5 +-- tests/data/io_data/bed_bad/s1_empty.bed | 0 .../data/io_data/bed_bad/s1_many_headers.bed | 11 +++++ tests/data/io_data/{bed => }/s1_a_bed_gz | Bin tests/test_io.py | 29 +++++++++++-- 7 files changed, 81 insertions(+), 12 deletions(-) create mode 100644 tests/data/io_data/bed/s1_a_coments.bed create mode 100644 tests/data/io_data/bed_bad/s1_empty.bed create mode 100644 tests/data/io_data/bed_bad/s1_many_headers.bed rename tests/data/io_data/{bed => }/s1_a_bed_gz (100%) diff --git a/geniml/io/io.py b/geniml/io/io.py index 657ea7fe..98df9ddf 100644 --- a/geniml/io/io.py +++ b/geniml/io/io.py @@ -55,12 +55,12 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False): :param regions: path, or url to bed file or list of Region objects :param backed: whether to load the bed file into memory or not [Default: False] """ - # load from file + self._df: Union[pd.DataFrame, None] = None + if isinstance(regions, str): self.backed = backed self.regions: List[Region] = [] self.path = regions - self.regions = None self.is_gzipped = False @@ -90,6 +90,7 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False): df = self._read_gzipped_file(regions) else: df = self._read_file_pd(regions, sep="\t", header=None, engine="pyarrow") + self._df = df _regions = [] df.apply( @@ -111,6 +112,15 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False): self._identifier = None + def to_pandas(self) -> Union[pd.DataFrame, None]: + if self._df is None: + seqnames, starts, ends = zip( + *[(region.chr, region.start, region.end) for region in self] + ) + return pd.DataFrame([seqnames, starts, ends]) + + return self._df + def _read_gzipped_file(self, file_path: str) -> pd.DataFrame: """ Read a gzipped file into a pandas dataframe @@ -140,12 +150,33 @@ def _read_file_pd(self, *args, **kwargs) -> pd.DataFrame: if row_count > 0: _LOGGER.info(f"Skipped {row_count} rows while standardization. File: '{args}'") df = df.dropna(axis=1) - return df + for index, row in df.iterrows(): + if ( + isinstance(row[0], str) + and isinstance(row[1], int) + and isinstance(row[2], int) + ): + return df + else: + if isinstance(row[1], str): + try: + _ = int(row[1]) + df[1] = pd.to_numeric(df[1]) + except ValueError: + row_count += 1 + break + if isinstance(row[2], str): + try: + _ = int(row[2]) + df[2] = pd.to_numeric(df[2]) + except ValueError: + row_count += 1 + break + return df except (pd.errors.ParserError, pd.errors.EmptyDataError) as _: if row_count <= max_rows: row_count += 1 - # if can't open file after 5 attempts try to open it with gzip - return self._read_gzipped_file(*args) + raise BEDFileReadError("Cannot read bed file.") def __len__(self): return self.length diff --git a/tests/data/io_data/bed/s1_a_coments.bed b/tests/data/io_data/bed/s1_a_coments.bed new file mode 100644 index 00000000..107ad60f --- /dev/null +++ b/tests/data/io_data/bed/s1_a_coments.bed @@ -0,0 +1,7 @@ +# THIS is big header +# with 4 lines +# and 3rd line +# is empty +chr1 10 30 +chr1 110 130 +chr1 210 230 diff --git a/tests/data/io_data/bed/s1_a_headers.bed b/tests/data/io_data/bed/s1_a_headers.bed index 107ad60f..8e7e103b 100644 --- a/tests/data/io_data/bed/s1_a_headers.bed +++ b/tests/data/io_data/bed/s1_a_headers.bed @@ -1,7 +1,4 @@ -# THIS is big header -# with 4 lines -# and 3rd line -# is empty +chrom_name one two chr1 10 30 chr1 110 130 chr1 210 230 diff --git a/tests/data/io_data/bed_bad/s1_empty.bed b/tests/data/io_data/bed_bad/s1_empty.bed new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/io_data/bed_bad/s1_many_headers.bed b/tests/data/io_data/bed_bad/s1_many_headers.bed new file mode 100644 index 00000000..8980e76e --- /dev/null +++ b/tests/data/io_data/bed_bad/s1_many_headers.bed @@ -0,0 +1,11 @@ +# THIS is big header +# with 4 lines +# and 3rd line +# is empty +# THIS is big header +# with 4 lines +# and 3rd line +# is empty +chr1 10 30 +chr1 110 130 +chr1 210 230 diff --git a/tests/data/io_data/bed/s1_a_bed_gz b/tests/data/io_data/s1_a_bed_gz similarity index 100% rename from tests/data/io_data/bed/s1_a_bed_gz rename to tests/data/io_data/s1_a_bed_gz diff --git a/tests/test_io.py b/tests/test_io.py index a6cfba3c..93d8f805 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -1,9 +1,10 @@ import os import genomicranges +import pandas as pd import pytest -from geniml.io.exceptions import GenimlBaseError +from geniml.io.exceptions import GenimlBaseError, BEDFileReadError from geniml.io.io import SNP, Maf, Region, RegionSet DATA_TEST_FOLDER = os.path.join( @@ -14,11 +15,15 @@ ) DATA_TEST_FOLDER_BED = os.path.join(DATA_TEST_FOLDER, "bed") DATA_TEST_FOLDER_MAF = os.path.join(DATA_TEST_FOLDER, "maf") +DATA_TEST_FOLDET_BAD_BED = os.path.join(DATA_TEST_FOLDER, "bed_bad") ALL_BEDFILE_PATH = [ os.path.join(DATA_TEST_FOLDER_BED, x) for x in os.listdir(DATA_TEST_FOLDER_BED) ] ALL_MAF_PATH = [os.path.join(DATA_TEST_FOLDER_MAF, x) for x in os.listdir(DATA_TEST_FOLDER_MAF)] +ALL_BADFILE_BAD_PATH = [ + os.path.join(DATA_TEST_FOLDET_BAD_BED, x) for x in os.listdir(DATA_TEST_FOLDET_BAD_BED) +] def test_make_region(): @@ -51,7 +56,7 @@ class TestRegionSet: @pytest.mark.parametrize( "url", [ - "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7666nnn/GSM7666464/suppl/GSM7666464_18134-282-06_S51_L003_peaks.narrowPeak.gz" + "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7666nnn/GSM7666464/suppl/GSM7666464_18134-282-06_S51_L003_peaks.narrowPeak.gz", ], ) def test_region_set_from_url(self, url): @@ -70,11 +75,17 @@ def test_region_set_from_path(self, url): assert isinstance(region, Region) break + @pytest.mark.parametrize("path", ALL_BADFILE_BAD_PATH) + def test_broken_bed_from_path(self, path): + with pytest.raises(BEDFileReadError): + region_set = RegionSet(path) + @pytest.mark.parametrize( "url", [ "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7666nnn/GSM7666464/suppl/GSM7666464_18134-282-06_S51_L003_peaks.narrowPeak.gz" - ], # This is not the right way how to do it! + ], + # TODO: This is not the right way how to do it! ) def test_region_set_from_url_cant_be_backed(self, url): with pytest.raises(GenimlBaseError): @@ -105,6 +116,18 @@ def test_calculation_id(self): assert len(bedfile_id_2) == 32 assert bedfile_id_1 == bedfile_id_2 == bedfile_id_3 + @pytest.mark.parametrize("url", ALL_BEDFILE_PATH) + def test_to_df(self, url): + region_set = RegionSet(url, backed=False) + pandas_df = region_set.to_pandas() + assert isinstance(pandas_df, pd.DataFrame) + + @pytest.mark.parametrize("url", ALL_BEDFILE_PATH) + def test_to_df_backed(self, url): + region_set = RegionSet(url, backed=True) + pandas_df = region_set.to_pandas() + assert isinstance(pandas_df, pd.DataFrame) + class TestMaff: @pytest.mark.parametrize("path", ALL_MAF_PATH) From 7e78cc05aea3fc3b1a9f5fdd734c40c54c1af163 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 18 Sep 2024 15:23:28 -0400 Subject: [PATCH 2/4] Fixed #164 --- geniml/_version.py | 2 +- geniml/bbclient/cli.py | 14 +++++++------- geniml/cli.py | 18 +++++++++++------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/geniml/_version.py b/geniml/_version.py index 6a9beea8..3d26edf7 100644 --- a/geniml/_version.py +++ b/geniml/_version.py @@ -1 +1 @@ -__version__ = "0.4.0" +__version__ = "0.4.1" diff --git a/geniml/bbclient/cli.py b/geniml/bbclient/cli.py index 98df48f9..7d2fd270 100755 --- a/geniml/bbclient/cli.py +++ b/geniml/bbclient/cli.py @@ -1,6 +1,6 @@ from logging import getLogger -from .const import MODULE_NAME +from .const import MODULE_NAME, DEFAULT_BEDBASE_API _LOGGER = getLogger(MODULE_NAME) @@ -12,7 +12,7 @@ def build_subparser_cache_bed(parser): parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path") parser.add_argument( "--cache-folder", - default=None, + default=DEFAULT_BEDBASE_API, help="Cache folder path (default: bed_cache)", ) @@ -26,7 +26,7 @@ def build_subparser_cache_bedset(parser): parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path") parser.add_argument( "--cache-folder", - default=None, + default=DEFAULT_BEDBASE_API, help="Cache folder path (default: bed_cache)", ) @@ -40,7 +40,7 @@ def build_subparser_seek(parser): parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path") parser.add_argument( "--cache-folder", - default=None, + default=DEFAULT_BEDBASE_API, help="Cache folder path (default: bed_cache)", ) @@ -53,7 +53,7 @@ def build_subparser_inspect(parser): """ parser.add_argument( "--cache-folder", - default=None, + default=DEFAULT_BEDBASE_API, help="Cache folder path (default: bed_cache)", ) @@ -75,7 +75,7 @@ def build_subparser_cache_tokens(parser): ) parser.add_argument( "--cache-folder", - default=None, + default=DEFAULT_BEDBASE_API, help="Cache folder path (default: bed_cache)", ) @@ -89,7 +89,7 @@ def build_subparser_remove(parser): parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path") parser.add_argument( "--cache-folder", - default=None, + default=DEFAULT_BEDBASE_API, help="Cache folder path (default: bed_cache)", ) diff --git a/geniml/cli.py b/geniml/cli.py index d7abccf6..63109063 100644 --- a/geniml/cli.py +++ b/geniml/cli.py @@ -114,11 +114,18 @@ def main(test_args=None): ) if args.command == "bbclient": - if args.subcommand is not None: + if args.subcommand in [ + "cache-bed", + "cache-tokens", + "cache-bedset", + "seek", + "inspect", + "rm", + ]: _LOGGER.info(f"Subcommand: {args.subcommand}") from .bbclient import BBClient - bbc = BBClient() + bbc = BBClient(cache_folder=args.cache_folder) else: # if no subcommand, print help format of bbclient subparser @@ -141,11 +148,8 @@ def main(test_args=None): if args.subcommand == "cache-bed": # if input is a BED file path if os.path.exists(args.identifier[0]): - from .io import RegionSet - - bedfile = RegionSet(args.identifier[0]) - bbc.add_bed_to_cache(bedfile) - _LOGGER.info(f"BED file {bedfile.compute_bed_identifier()} has been cached") + identifier = bbc.add_bed_to_cache(args.identifier[0]) + _LOGGER.info(f"BED file {identifier} has been cached") else: bbc.load_bed(args.identifier[0]) From 95cd35b0517ad54e64e12cad1c91c11147aae864 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 19 Sep 2024 10:29:13 -0400 Subject: [PATCH 3/4] fixed incorrect constant issue --- geniml/bbclient/cli.py | 14 +++++++------- tests/test_io.py | 10 ++++------ 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/geniml/bbclient/cli.py b/geniml/bbclient/cli.py index 7d2fd270..69270bf6 100755 --- a/geniml/bbclient/cli.py +++ b/geniml/bbclient/cli.py @@ -1,6 +1,6 @@ from logging import getLogger -from .const import MODULE_NAME, DEFAULT_BEDBASE_API +from .const import MODULE_NAME, DEFAULT_CACHE_FOLDER _LOGGER = getLogger(MODULE_NAME) @@ -12,7 +12,7 @@ def build_subparser_cache_bed(parser): parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path") parser.add_argument( "--cache-folder", - default=DEFAULT_BEDBASE_API, + default=DEFAULT_CACHE_FOLDER, help="Cache folder path (default: bed_cache)", ) @@ -26,7 +26,7 @@ def build_subparser_cache_bedset(parser): parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path") parser.add_argument( "--cache-folder", - default=DEFAULT_BEDBASE_API, + default=DEFAULT_CACHE_FOLDER, help="Cache folder path (default: bed_cache)", ) @@ -40,7 +40,7 @@ def build_subparser_seek(parser): parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path") parser.add_argument( "--cache-folder", - default=DEFAULT_BEDBASE_API, + default=DEFAULT_CACHE_FOLDER, help="Cache folder path (default: bed_cache)", ) @@ -53,7 +53,7 @@ def build_subparser_inspect(parser): """ parser.add_argument( "--cache-folder", - default=DEFAULT_BEDBASE_API, + default=DEFAULT_CACHE_FOLDER, help="Cache folder path (default: bed_cache)", ) @@ -75,7 +75,7 @@ def build_subparser_cache_tokens(parser): ) parser.add_argument( "--cache-folder", - default=DEFAULT_BEDBASE_API, + default=DEFAULT_CACHE_FOLDER, help="Cache folder path (default: bed_cache)", ) @@ -89,7 +89,7 @@ def build_subparser_remove(parser): parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path") parser.add_argument( "--cache-folder", - default=DEFAULT_BEDBASE_API, + default=DEFAULT_CACHE_FOLDER, help="Cache folder path (default: bed_cache)", ) diff --git a/tests/test_io.py b/tests/test_io.py index 93d8f805..43ffa52f 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -15,14 +15,14 @@ ) DATA_TEST_FOLDER_BED = os.path.join(DATA_TEST_FOLDER, "bed") DATA_TEST_FOLDER_MAF = os.path.join(DATA_TEST_FOLDER, "maf") -DATA_TEST_FOLDET_BAD_BED = os.path.join(DATA_TEST_FOLDER, "bed_bad") +DATA_TEST_FOLDER_BED_BAD = os.path.join(DATA_TEST_FOLDER, "bed_bad") ALL_BEDFILE_PATH = [ os.path.join(DATA_TEST_FOLDER_BED, x) for x in os.listdir(DATA_TEST_FOLDER_BED) ] ALL_MAF_PATH = [os.path.join(DATA_TEST_FOLDER_MAF, x) for x in os.listdir(DATA_TEST_FOLDER_MAF)] ALL_BADFILE_BAD_PATH = [ - os.path.join(DATA_TEST_FOLDET_BAD_BED, x) for x in os.listdir(DATA_TEST_FOLDET_BAD_BED) + os.path.join(DATA_TEST_FOLDER_BED_BAD, x) for x in os.listdir(DATA_TEST_FOLDER_BED_BAD) ] @@ -56,7 +56,7 @@ class TestRegionSet: @pytest.mark.parametrize( "url", [ - "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7666nnn/GSM7666464/suppl/GSM7666464_18134-282-06_S51_L003_peaks.narrowPeak.gz", + "https://github.com/databio/geniml/raw/master/tests/data/io_data/bed/s1_a.bed.gz", ], ) def test_region_set_from_url(self, url): @@ -82,9 +82,7 @@ def test_broken_bed_from_path(self, path): @pytest.mark.parametrize( "url", - [ - "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7666nnn/GSM7666464/suppl/GSM7666464_18134-282-06_S51_L003_peaks.narrowPeak.gz" - ], + ["https://github.com/databio/geniml/raw/master/tests/data/io_data/bed/s1_a.bed.gz"], # TODO: This is not the right way how to do it! ) def test_region_set_from_url_cant_be_backed(self, url): From 42f3f03f0986638c8488793120acc22147422d0a Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 19 Sep 2024 10:38:28 -0400 Subject: [PATCH 4/4] cleand unused data --- .../bedfiles/BiocFileCache.sqlite | Bin 24576 -> 0 bytes .../bedsets/BiocFileCache.sqlite | Bin 24576 -> 0 bytes data/geniml_bb_cache/tokens.zarr/.zgroup | 3 --- 3 files changed, 3 deletions(-) delete mode 100644 data/geniml_bb_cache/bedfiles/BiocFileCache.sqlite delete mode 100644 data/geniml_bb_cache/bedsets/BiocFileCache.sqlite delete mode 100644 data/geniml_bb_cache/tokens.zarr/.zgroup diff --git a/data/geniml_bb_cache/bedfiles/BiocFileCache.sqlite b/data/geniml_bb_cache/bedfiles/BiocFileCache.sqlite deleted file mode 100644 index d58e4e47a585529a7046e687ecc3b0bee5f5ede2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24576 zcmeI(O>fgM7zc2BeSt=*0;h^YC2OZfpaL3~-6*X=)!J;NskEIUukn~g)0V`Yx+92B z#E~z+N8!eqBTm5G0umBOs=pO0j_ug-?@V9!a^OGqeK>z{}fB*y_009U<00I!$K>@#3 zZ+UGh)gqP~wTV)hE!XKxMk=<&dOzU3kh7sb;HRwIhaLOO_G29tdmoOfcUtW>{iw7! zpG(_j&$VA;g}oOee(O%Bp<**Z$Y1e*`Ntvir=!s$<}^}MAM(?%C@(}Jw^y@N3N0h8 zOx?gdUpO3dHsCLN(^1Ge{b>+zKaBF)N!S~oc=-roHj`QQZ}+4tp)a?yruuqy^9&cI zwz0(vA=_%td!-23TGnwI#ofgMN)fse z^Yx8LX$?Q;bG*lU^;W-4NwJLV-Q4WyBeB-Y;ySW(L{Dw0VqUbn`ODTcnYv$zIglU# z0SG_<0uX=z1Rwwb2tWV=5ct0ZTD0Ol$Yz%^5z&=QGqqei=^h?-pE+h-)cr=xfdl~v zKmY;|fB*y_009U<00Izzz)lFfgM7zc2BeSt=*0;h^YC2OZfpaL3~-6*X=)!J;NskEIUukn~g)0V`Yx+92B z#E~z+N8!eqBTm5G0umBOs=pO0j_ug-?@V9!a^OGqeK>z{}fB*y_009U<00I!$K>@#3 zZ+UGh)gqP~wTV)hE!XKxMk=<&dOzU3kh7sb;HRwIhaLOO_G29tdmoOfcUtW>{iw7! zpG(_j&$VA;g}oOee(O%Bp<**Z$Y1e*`Ntvir=!s$<}^}MAM(?%C@(}Jw^y@N3N0h8 zOx?gdUpO3dHsCLN(^1Ge{b>+zKaBF)N!S~oc=-roHj`QQZ}+4tp)a?yruuqy^9&cI zwz0(vA=_%td!-23TGnwI#ofgMN)fse z^Yx8LX$?Q;bG*lU^;W-4NwJLV-Q4WyBeB-Y;ySW(L{Dw0VqUbn`ODTcnYv$zIglU# z0SG_<0uX=z1Rwwb2tWV=5ct0ZTD0Ol$Yz%^5z&=QGqqei=^h?-pE+h-)cr=xfdl~v zKmY;|fB*y_009U<00Izzz)lF