From 0490dd1671f1b938d67f922ee5018f7f5d2f25df Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 18 Sep 2024 15:34:33 -0400 Subject: [PATCH] moved opening bed file to geniml --- .../{refgenomevalidator.py => main.py} | 0 bedboss/refgenome_validator/utils.py | 92 ++----------------- requirements/requirements-all.txt | 4 +- test/test_ref_validator.py | 11 +-- 4 files changed, 13 insertions(+), 94 deletions(-) rename bedboss/refgenome_validator/{refgenomevalidator.py => main.py} (100%) diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/main.py similarity index 100% rename from bedboss/refgenome_validator/refgenomevalidator.py rename to bedboss/refgenome_validator/main.py diff --git a/bedboss/refgenome_validator/utils.py b/bedboss/refgenome_validator/utils.py index 502093f..0eb4d2b 100644 --- a/bedboss/refgenome_validator/utils.py +++ b/bedboss/refgenome_validator/utils.py @@ -1,99 +1,21 @@ from typing import Union -import pandas as pd -from geniml.io.utils import is_gzipped +from geniml.io import RegionSet import logging -from bedboss.exceptions import BedBossException - _LOGGER = logging.getLogger("bedboss") -def _read_gzipped_file(file_path: str) -> pd.DataFrame: - """ - !! Copy from geniml! - Read a gzipped file into a pandas dataframe - - :param file_path: path to gzipped file - :return: pandas dataframe - """ - return _read_file_pd( - file_path, - sep="\t", - compression="gzip", - header=None, - engine="pyarrow", - ) - - -def _read_file_pd(*args, **kwargs) -> pd.DataFrame: - """ - !! Copy from geniml! - Read bed file into a pandas DataFrame, and skip header rows if needed - - :return: pandas dataframe +def get_bed_chrom_info(bedfile: Union[str, RegionSet]) -> dict: """ - max_rows = 5 - row_count = 0 - while row_count <= max_rows: - try: - df = pd.read_csv(*args, **kwargs, skiprows=row_count) - if row_count > 0: - _LOGGER.info( - f"Skipped {row_count} rows while standardization. File: '{args}'" - ) - df = df.dropna(axis=1) - for index, row in df.iterrows(): - if ( - isinstance(row[0], str) - and isinstance(row[1], int) - and isinstance(row[2], int) - ): - return df - else: - if isinstance(row[1], str): - try: - _ = int(row[1]) - df[1] = pd.to_numeric(df[1]) - except ValueError: - row_count += 1 - break - if isinstance(row[2], str): - try: - _ = int(row[2]) - df[2] = pd.to_numeric(df[2]) - except ValueError: - row_count += 1 - break - return df - except (pd.errors.ParserError, pd.errors.EmptyDataError) as _: - if row_count <= max_rows: - row_count += 1 - raise BedfileReadException(reason="Cannot read bed file.") + Open bed file and find all of the chromosomes and the max length of each. - -def get_bed_chrom_info(bedfile: str) -> dict: - """ - Attempt to open it and read it to find all of the chromosomes and the max length of each. - - :param bedfile: bedfilepath + :param bedfile: RegionSet object or path to bed file returns dict: returns dictionary where keys are chrom names and values are the max end position of that chromosome. """ - if is_gzipped(bedfile): - df = _read_gzipped_file(bedfile) + if isinstance(bedfile, RegionSet): + df = bedfile.to_pandas() else: - df = _read_file_pd(bedfile, sep="\t", header=None, engine="pyarrow") + df = RegionSet(bedfile).to_pandas() max_end_for_each_chrom = df.groupby(0)[2].max() return max_end_for_each_chrom.to_dict() - - -class BedfileReadException(BedBossException): - """Exception when there is an exception during refgenome validation""" - - def __init__(self, reason: str = ""): - """ - Optionally provide explanation for exceptional condition. - - :param str reason: some context why error occurred - """ - super(BedfileReadException, self).__init__(reason) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 2fb0151..521a177 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,6 +1,6 @@ logmuse>=0.2.7 coloredlogs>=15.0.1 -peppy>=0.40.5 +peppy>=0.40.6 yacman>=0.8.4 requests>=2.28.2 piper>=v0.14.0 @@ -10,4 +10,4 @@ refgenconf>=0.12.2 pandas>=2.0.0 ubiquerg>=0.6.2 pephubclient>=0.4.4 -geniml>=0.4.0 \ No newline at end of file +geniml>=0.4.1 \ No newline at end of file diff --git a/test/test_ref_validator.py b/test/test_ref_validator.py index 9293534..bc5bd4f 100644 --- a/test/test_ref_validator.py +++ b/test/test_ref_validator.py @@ -9,12 +9,9 @@ def test_main(): - # ff = ReferenceValidator().determine_compatibility( - # FILE_PATH, - # concise=True, - # ) - ff = ReferenceValidator().determine_compatibility( - "/home/bnt4me/.bbcache/bedfiles/3/2/GSE244926_mm39_LPx6_oligofile.bed.gz", + dict_result = ReferenceValidator().determine_compatibility( + FILE_PATH, concise=True, ) - ff + + assert dict_result