From 0490dd1671f1b938d67f922ee5018f7f5d2f25df Mon Sep 17 00:00:00 2001
From: Khoroshevskyi <sasha99250@gmail.com>
Date: Wed, 18 Sep 2024 15:34:33 -0400
Subject: [PATCH] Move opening of BED files to geniml

---
 .../{refgenomevalidator.py => main.py}        |  0
 bedboss/refgenome_validator/utils.py          | 92 ++-----------------
 requirements/requirements-all.txt             |  4 +-
 test/test_ref_validator.py                    | 11 +--
 4 files changed, 13 insertions(+), 94 deletions(-)
 rename bedboss/refgenome_validator/{refgenomevalidator.py => main.py} (100%)

diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/main.py
similarity index 100%
rename from bedboss/refgenome_validator/refgenomevalidator.py
rename to bedboss/refgenome_validator/main.py
diff --git a/bedboss/refgenome_validator/utils.py b/bedboss/refgenome_validator/utils.py
index 502093f..0eb4d2b 100644
--- a/bedboss/refgenome_validator/utils.py
+++ b/bedboss/refgenome_validator/utils.py
@@ -1,99 +1,21 @@
 from typing import Union
-import pandas as pd
-from geniml.io.utils import is_gzipped
+from geniml.io import RegionSet
 import logging
 
-from bedboss.exceptions import BedBossException
-
 _LOGGER = logging.getLogger("bedboss")
 
 
-def _read_gzipped_file(file_path: str) -> pd.DataFrame:
-    """
-    !! Copy from geniml!
-    Read a gzipped file into a pandas dataframe
-
-    :param file_path: path to gzipped file
-    :return: pandas dataframe
-    """
-    return _read_file_pd(
-        file_path,
-        sep="\t",
-        compression="gzip",
-        header=None,
-        engine="pyarrow",
-    )
-
-
-def _read_file_pd(*args, **kwargs) -> pd.DataFrame:
-    """
-    !! Copy from geniml!
-    Read bed file into a pandas DataFrame, and skip header rows if needed
-
-    :return: pandas dataframe
+def get_bed_chrom_info(bedfile: Union[str, RegionSet]) -> dict:
     """
-    max_rows = 5
-    row_count = 0
-    while row_count <= max_rows:
-        try:
-            df = pd.read_csv(*args, **kwargs, skiprows=row_count)
-            if row_count > 0:
-                _LOGGER.info(
-                    f"Skipped {row_count} rows while standardization. File: '{args}'"
-                )
-            df = df.dropna(axis=1)
-            for index, row in df.iterrows():
-                if (
-                    isinstance(row[0], str)
-                    and isinstance(row[1], int)
-                    and isinstance(row[2], int)
-                ):
-                    return df
-                else:
-                    if isinstance(row[1], str):
-                        try:
-                            _ = int(row[1])
-                            df[1] = pd.to_numeric(df[1])
-                        except ValueError:
-                            row_count += 1
-                            break
-                    if isinstance(row[2], str):
-                        try:
-                            _ = int(row[2])
-                            df[2] = pd.to_numeric(df[2])
-                        except ValueError:
-                            row_count += 1
-                            break
-                    return df
-        except (pd.errors.ParserError, pd.errors.EmptyDataError) as _:
-            if row_count <= max_rows:
-                row_count += 1
-    raise BedfileReadException(reason="Cannot read bed file.")
+    Open a BED file and find every chromosome and the maximum end position on each.
 
-
-def get_bed_chrom_info(bedfile: str) -> dict:
-    """
-    Attempt to open it and read it to find all of the chromosomes and the max length of each.
-
-    :param bedfile: bedfilepath
+    :param bedfile: RegionSet object or path to bed file
     returns dict: returns dictionary where keys are chrom names and values are the max end position of that chromosome.
     """
-    if is_gzipped(bedfile):
-        df = _read_gzipped_file(bedfile)
+    if isinstance(bedfile, RegionSet):
+        df = bedfile.to_pandas()
     else:
-        df = _read_file_pd(bedfile, sep="\t", header=None, engine="pyarrow")
+        df = RegionSet(bedfile).to_pandas()
 
     max_end_for_each_chrom = df.groupby(0)[2].max()
     return max_end_for_each_chrom.to_dict()
-
-
-class BedfileReadException(BedBossException):
-    """Exception when there is an exception during refgenome validation"""
-
-    def __init__(self, reason: str = ""):
-        """
-        Optionally provide explanation for exceptional condition.
-
-        :param str reason: some context why error occurred
-        """
-        super(BedfileReadException, self).__init__(reason)
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 2fb0151..521a177 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -1,6 +1,6 @@
 logmuse>=0.2.7
 coloredlogs>=15.0.1
-peppy>=0.40.5
+peppy>=0.40.6
 yacman>=0.8.4
 requests>=2.28.2
 piper>=v0.14.0
@@ -10,4 +10,4 @@ refgenconf>=0.12.2
 pandas>=2.0.0
 ubiquerg>=0.6.2
 pephubclient>=0.4.4
-geniml>=0.4.0
\ No newline at end of file
+geniml>=0.4.1
\ No newline at end of file
diff --git a/test/test_ref_validator.py b/test/test_ref_validator.py
index 9293534..bc5bd4f 100644
--- a/test/test_ref_validator.py
+++ b/test/test_ref_validator.py
@@ -9,12 +9,9 @@
 
 
 def test_main():
-    # ff = ReferenceValidator().determine_compatibility(
-    #     FILE_PATH,
-    #     concise=True,
-    # )
-    ff = ReferenceValidator().determine_compatibility(
-        "/home/bnt4me/.bbcache/bedfiles/3/2/GSE244926_mm39_LPx6_oligofile.bed.gz",
+    dict_result = ReferenceValidator().determine_compatibility(
+        FILE_PATH,
         concise=True,
     )
-    ff
+
+    assert dict_result