Skip to content

Commit

Permalink
moved opening bed file to geniml
Browse files Browse the repository at this point in the history
  • Loading branch information
khoroshevskyi committed Sep 18, 2024
1 parent 1591178 commit 0490dd1
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 94 deletions.
File renamed without changes.
92 changes: 7 additions & 85 deletions bedboss/refgenome_validator/utils.py
Original file line number Diff line number Diff line change
@@ -1,99 +1,21 @@
from typing import Union
import pandas as pd
from geniml.io.utils import is_gzipped
from geniml.io import RegionSet
import logging

from bedboss.exceptions import BedBossException

_LOGGER = logging.getLogger("bedboss")


def _read_gzipped_file(file_path: str) -> pd.DataFrame:
    """
    Load a gzip-compressed, tab-separated file into a pandas DataFrame.

    Copied from geniml. The actual reading is delegated to ``_read_file_pd``;
    this wrapper only fixes the reader options for gzipped input.

    :param file_path: path to gzipped file
    :return: pandas dataframe
    """
    # Options forwarded unchanged to the underlying reader.
    read_options = {
        "sep": "\t",
        "compression": "gzip",
        "header": None,
        "engine": "pyarrow",
    }
    return _read_file_pd(file_path, **read_options)


def _read_file_pd(*args, **kwargs) -> pd.DataFrame:
"""
!! Copy from geniml!
Read bed file into a pandas DataFrame, and skip header rows if needed
:return: pandas dataframe
def get_bed_chrom_info(bedfile: Union[str, RegionSet]) -> dict:
"""
max_rows = 5
row_count = 0
while row_count <= max_rows:
try:
df = pd.read_csv(*args, **kwargs, skiprows=row_count)
if row_count > 0:
_LOGGER.info(
f"Skipped {row_count} rows while standardization. File: '{args}'"
)
df = df.dropna(axis=1)
for index, row in df.iterrows():
if (
isinstance(row[0], str)
and isinstance(row[1], int)
and isinstance(row[2], int)
):
return df
else:
if isinstance(row[1], str):
try:
_ = int(row[1])
df[1] = pd.to_numeric(df[1])
except ValueError:
row_count += 1
break
if isinstance(row[2], str):
try:
_ = int(row[2])
df[2] = pd.to_numeric(df[2])
except ValueError:
row_count += 1
break
return df
except (pd.errors.ParserError, pd.errors.EmptyDataError) as _:
if row_count <= max_rows:
row_count += 1
raise BedfileReadException(reason="Cannot read bed file.")
Open bed file and find all of the chromosomes and the max length of each.

def get_bed_chrom_info(bedfile: str) -> dict:
"""
Attempt to open it and read it to find all of the chromosomes and the max length of each.
:param bedfile: bedfilepath
:param bedfile: RegionSet object or path to bed file
:return: dictionary where keys are chrom names and values are the max end position of that chromosome.
"""
if is_gzipped(bedfile):
df = _read_gzipped_file(bedfile)
if isinstance(bedfile, RegionSet):
df = bedfile.to_pandas()
else:
df = _read_file_pd(bedfile, sep="\t", header=None, engine="pyarrow")
df = RegionSet(bedfile).to_pandas()

max_end_for_each_chrom = df.groupby(0)[2].max()
return max_end_for_each_chrom.to_dict()


class BedfileReadException(BedBossException):
    """Exception raised during refgenome validation when a bed file cannot be read."""

    def __init__(self, reason: str = ""):
        """
        Create the exception, optionally with an explanation of the failure.

        :param str reason: some context why error occurred
        """
        super().__init__(reason)
4 changes: 2 additions & 2 deletions requirements/requirements-all.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
logmuse>=0.2.7
coloredlogs>=15.0.1
peppy>=0.40.5
peppy>=0.40.6
yacman>=0.8.4
requests>=2.28.2
piper>=v0.14.0
Expand All @@ -10,4 +10,4 @@ refgenconf>=0.12.2
pandas>=2.0.0
ubiquerg>=0.6.2
pephubclient>=0.4.4
geniml>=0.4.0
geniml>=0.4.1
11 changes: 4 additions & 7 deletions test/test_ref_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,9 @@


def test_main():
# ff = ReferenceValidator().determine_compatibility(
# FILE_PATH,
# concise=True,
# )
ff = ReferenceValidator().determine_compatibility(
"/home/bnt4me/.bbcache/bedfiles/3/2/GSE244926_mm39_LPx6_oligofile.bed.gz",
dict_result = ReferenceValidator().determine_compatibility(
FILE_PATH,
concise=True,
)
ff

assert dict_result

0 comments on commit 0490dd1

Please sign in to comment.