Skip to content

Commit

Permalink
Merge pull request #170 from databio/dev_io
Browse files Browse the repository at this point in the history
Release 0.4.1
  • Loading branch information
khoroshevskyi authored Sep 19, 2024
2 parents 440f979 + 42f3f03 commit 2809e8e
Show file tree
Hide file tree
Showing 13 changed files with 100 additions and 32 deletions.
Binary file not shown.
Binary file removed data/geniml_bb_cache/bedsets/BiocFileCache.sqlite
Binary file not shown.
3 changes: 0 additions & 3 deletions data/geniml_bb_cache/tokens.zarr/.zgroup

This file was deleted.

2 changes: 1 addition & 1 deletion geniml/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.0"
__version__ = "0.4.1"
14 changes: 7 additions & 7 deletions geniml/bbclient/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from logging import getLogger

from .const import MODULE_NAME
from .const import MODULE_NAME, DEFAULT_CACHE_FOLDER

_LOGGER = getLogger(MODULE_NAME)

Expand All @@ -12,7 +12,7 @@ def build_subparser_cache_bed(parser):
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand All @@ -26,7 +26,7 @@ def build_subparser_cache_bedset(parser):
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand All @@ -40,7 +40,7 @@ def build_subparser_seek(parser):
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand All @@ -53,7 +53,7 @@ def build_subparser_inspect(parser):
"""
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand All @@ -75,7 +75,7 @@ def build_subparser_cache_tokens(parser):
)
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand All @@ -89,7 +89,7 @@ def build_subparser_remove(parser):
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
parser.add_argument(
"--cache-folder",
default=None,
default=DEFAULT_CACHE_FOLDER,
help="Cache folder path (default: bed_cache)",
)

Expand Down
18 changes: 11 additions & 7 deletions geniml/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,18 @@ def main(test_args=None):
)

if args.command == "bbclient":
if args.subcommand is not None:
if args.subcommand in [
"cache-bed",
"cache-tokens",
"cache-bedset",
"seek",
"inspect",
"rm",
]:
_LOGGER.info(f"Subcommand: {args.subcommand}")
from .bbclient import BBClient

bbc = BBClient()
bbc = BBClient(cache_folder=args.cache_folder)

else:
# if no subcommand, print help format of bbclient subparser
Expand All @@ -141,11 +148,8 @@ def main(test_args=None):
if args.subcommand == "cache-bed":
# if input is a BED file path
if os.path.exists(args.identifier[0]):
from .io import RegionSet

bedfile = RegionSet(args.identifier[0])
bbc.add_bed_to_cache(bedfile)
_LOGGER.info(f"BED file {bedfile.compute_bed_identifier()} has been cached")
identifier = bbc.add_bed_to_cache(args.identifier[0])
_LOGGER.info(f"BED file {identifier} has been cached")
else:
bbc.load_bed(args.identifier[0])

Expand Down
41 changes: 36 additions & 5 deletions geniml/io/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,12 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False):
:param regions: path, or url to bed file or list of Region objects
:param backed: whether to load the bed file into memory or not [Default: False]
"""
# load from file
self._df: Union[pd.DataFrame, None] = None

if isinstance(regions, str):
self.backed = backed
self.regions: List[Region] = []
self.path = regions

self.regions = None
self.is_gzipped = False

Expand Down Expand Up @@ -90,6 +90,7 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False):
df = self._read_gzipped_file(regions)
else:
df = self._read_file_pd(regions, sep="\t", header=None, engine="pyarrow")
self._df = df

_regions = []
df.apply(
Expand All @@ -111,6 +112,15 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False):

self._identifier = None

def to_pandas(self) -> Union[pd.DataFrame, None]:
    """Return the regions as a pandas DataFrame, one region per row.

    If a frame was cached while parsing the BED file (``self._df``), it is
    returned as-is. Otherwise the frame is rebuilt from the Region objects,
    using the same one-region-per-row orientation (chrom, start, end) as
    the cached frame.

    :return: DataFrame with one row per region.
    """
    if self._df is not None:
        return self._df

    # NOTE(review): the original code did pd.DataFrame([seqnames, starts, ends]),
    # which produces a transposed (3 x N) frame — rows chr/start/end rather than
    # one region per row — and zip(*[]) raised ValueError on an empty RegionSet.
    # Build row-wise instead, matching self._df's layout; empty input yields an
    # empty DataFrame.
    rows = [(region.chr, region.start, region.end) for region in self]
    return pd.DataFrame(rows)

def _read_gzipped_file(self, file_path: str) -> pd.DataFrame:
"""
Read a gzipped file into a pandas dataframe
Expand Down Expand Up @@ -140,12 +150,33 @@ def _read_file_pd(self, *args, **kwargs) -> pd.DataFrame:
if row_count > 0:
_LOGGER.info(f"Skipped {row_count} rows while standardization. File: '{args}'")
df = df.dropna(axis=1)
return df
for index, row in df.iterrows():
if (
isinstance(row[0], str)
and isinstance(row[1], int)
and isinstance(row[2], int)
):
return df
else:
if isinstance(row[1], str):
try:
_ = int(row[1])
df[1] = pd.to_numeric(df[1])
except ValueError:
row_count += 1
break
if isinstance(row[2], str):
try:
_ = int(row[2])
df[2] = pd.to_numeric(df[2])
except ValueError:
row_count += 1
break
return df
except (pd.errors.ParserError, pd.errors.EmptyDataError) as _:
if row_count <= max_rows:
row_count += 1
# if can't open file after 5 attempts try to open it with gzip
return self._read_gzipped_file(*args)
raise BEDFileReadError("Cannot read bed file.")

def __len__(self):
    """Return ``self.length`` — presumably the number of regions in this
    RegionSet (the attribute is set elsewhere; not visible in this view)."""
    return self.length
Expand Down
7 changes: 7 additions & 0 deletions tests/data/io_data/bed/s1_a_coments.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# THIS is big header
# with 4 lines
# and 3rd line
# is empty
chr1 10 30
chr1 110 130
chr1 210 230
5 changes: 1 addition & 4 deletions tests/data/io_data/bed/s1_a_headers.bed
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
# THIS is big header
# with 4 lines
# and 3rd line
# is empty
chrom_name one two
chr1 10 30
chr1 110 130
chr1 210 230
Empty file.
11 changes: 11 additions & 0 deletions tests/data/io_data/bed_bad/s1_many_headers.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# THIS is big header
# with 4 lines
# and 3rd line
# is empty
# THIS is big header
# with 4 lines
# and 3rd line
# is empty
chr1 10 30
chr1 110 130
chr1 210 230
File renamed without changes.
31 changes: 26 additions & 5 deletions tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os

import genomicranges
import pandas as pd
import pytest

from geniml.io.exceptions import GenimlBaseError
from geniml.io.exceptions import GenimlBaseError, BEDFileReadError
from geniml.io.io import SNP, Maf, Region, RegionSet

DATA_TEST_FOLDER = os.path.join(
Expand All @@ -14,11 +15,15 @@
)
DATA_TEST_FOLDER_BED = os.path.join(DATA_TEST_FOLDER, "bed")
DATA_TEST_FOLDER_MAF = os.path.join(DATA_TEST_FOLDER, "maf")
DATA_TEST_FOLDER_BED_BAD = os.path.join(DATA_TEST_FOLDER, "bed_bad")

ALL_BEDFILE_PATH = [
os.path.join(DATA_TEST_FOLDER_BED, x) for x in os.listdir(DATA_TEST_FOLDER_BED)
]
ALL_MAF_PATH = [os.path.join(DATA_TEST_FOLDER_MAF, x) for x in os.listdir(DATA_TEST_FOLDER_MAF)]
ALL_BADFILE_BAD_PATH = [
os.path.join(DATA_TEST_FOLDER_BED_BAD, x) for x in os.listdir(DATA_TEST_FOLDER_BED_BAD)
]


def test_make_region():
Expand Down Expand Up @@ -51,7 +56,7 @@ class TestRegionSet:
@pytest.mark.parametrize(
"url",
[
"ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7666nnn/GSM7666464/suppl/GSM7666464_18134-282-06_S51_L003_peaks.narrowPeak.gz"
"https://github.com/databio/geniml/raw/master/tests/data/io_data/bed/s1_a.bed.gz",
],
)
def test_region_set_from_url(self, url):
Expand All @@ -70,11 +75,15 @@ def test_region_set_from_path(self, url):
assert isinstance(region, Region)
break

@pytest.mark.parametrize("path", ALL_BADFILE_BAD_PATH)
def test_broken_bed_from_path(self, path):
    """Every file in the bad-BED folder must fail to parse with BEDFileReadError."""
    with pytest.raises(BEDFileReadError):
        # Construction itself must raise; binding the result to a local
        # (as before) only triggered an unused-variable lint warning.
        RegionSet(path)

@pytest.mark.parametrize(
"url",
[
"ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7666nnn/GSM7666464/suppl/GSM7666464_18134-282-06_S51_L003_peaks.narrowPeak.gz"
], # This is not the right way how to do it!
["https://github.com/databio/geniml/raw/master/tests/data/io_data/bed/s1_a.bed.gz"],
# TODO: This is not the right way how to do it!
)
def test_region_set_from_url_cant_be_backed(self, url):
with pytest.raises(GenimlBaseError):
Expand Down Expand Up @@ -105,6 +114,18 @@ def test_calculation_id(self):
assert len(bedfile_id_2) == 32
assert bedfile_id_1 == bedfile_id_2 == bedfile_id_3

@pytest.mark.parametrize("url", ALL_BEDFILE_PATH)
def test_to_df(self, url):
    """An in-memory (non-backed) RegionSet converts to a pandas DataFrame."""
    frame = RegionSet(url, backed=False).to_pandas()
    assert isinstance(frame, pd.DataFrame)

@pytest.mark.parametrize("url", ALL_BEDFILE_PATH)
def test_to_df_backed(self, url):
    """A backed (lazily loaded) RegionSet also converts to a pandas DataFrame."""
    frame = RegionSet(url, backed=True).to_pandas()
    assert isinstance(frame, pd.DataFrame)


class TestMaff:
@pytest.mark.parametrize("path", ALL_MAF_PATH)
Expand Down

0 comments on commit 2809e8e

Please sign in to comment.