From ab9974f4ba3ea7379f6b6315b27ac645305ac0ae Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Fri, 9 Feb 2024 16:38:28 -0500 Subject: [PATCH 01/13] Initial work on bedclassifier for https://github.com/databio/bedbase/issues/55 --- bedboss/bedclassifier/__init__.py | 0 bedboss/bedclassifier/bedclassifier.py | 169 +++++++++++++++++++++++++ bedboss/bedmaker/bedmaker.py | 91 +------------ 3 files changed, 171 insertions(+), 89 deletions(-) create mode 100644 bedboss/bedclassifier/__init__.py create mode 100644 bedboss/bedclassifier/bedclassifier.py diff --git a/bedboss/bedclassifier/__init__.py b/bedboss/bedclassifier/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py new file mode 100644 index 0000000..2fe8bcc --- /dev/null +++ b/bedboss/bedclassifier/bedclassifier.py @@ -0,0 +1,169 @@ +import gzip +import logging +import os +import shutil +from typing import Optional + +import pypiper +import pandas as pd + +from bedboss.const import STANDARD_CHROM_LIST + +_LOGGER = logging.getLogger("bedboss") + + +class BedClassifier: + """ + This will take the input of either a .bed or a .bed.gz and classify the type of BED file. + + Types: + BED, BED2 - BED12, narrowPeak, broadPeak + UnknownType + + """ + + def __init__( + self, + input_file: str, + output_dir: Optional[str] = None, + bed_digest: Optional[str] = None, + input_type: Optional[str] = None, + pm: pypiper.PipelineManager = None, + report_to_database: Optional[bool] = False, + ): + # Raise Exception if input_type is given and it is NOT a BED file + # Raise Exception if the input file cannot be resolved + self.input_file = input_file + self.bed_digest = bed_digest + self.input_type = input_type + + self.abs_bed_path = os.path.abspath(self.input_file) + self.file_name = os.path.basename(self.abs_bed_path) + self.file_extension = os.path.splitext(self.abs_bed_path)[0] + + # we need this only if unzipping a file + self.output_dir = output_dir or os.path.join( + os.path.dirname(self.abs_bed_path) + "temp_processing" + ) + # Use existing Pipeline Manager or Construct New one + # Want to use Pipeline Manager to log work AND cleanup unzipped gz files. + if pm is not None: + self.pm = pm + else: + self.logs_dir = os.path.join(os.path.dirname(self.abs_bed_path) + "logs") + self.pm = pypiper.PipelineManager( + name="bedclassifier", outfolder=self.logs_dir, recover=True + ) + + if self.file_extension == ".gz": + unzipped_input_file = os.path.join(self.output_dir, self.file_name) + with gzip.open(self.input_file, "rb") as f_in: + with open(unzipped_input_file, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + self.input_file = unzipped_input_file + self.pm.clean_add(unzipped_input_file) + + bed_type = get_bed_type(self.input_file) + + if self.input_type is not None: + if bed_type != self.input_type: + _LOGGER.warning( + f"BED file classified as different type than given input: {bed_type} vs {self.input_type}" + ) + + else: + self.input_file = bed_type + + +def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> str: + """ + get the bed file type (ex. bed3, bed3+n ) + standardize chromosomes if necessary: + filter the input file to contain only the standard chromosomes, + remove regions on ChrUn chromosomes + + :param bed: path to the bed file + :param standard_chrom: + :return bed type + """ + # column format for bed12 + # string chrom; "Reference sequence chromosome or scaffold" + # uint chromStart; "Start position in chromosome" + # uint chromEnd; "End position in chromosome" + # string name; "Name of item." + # uint score; "Score (0-1000)" + # char[1] strand; "+ or - for strand" + # uint thickStart; "Start of where display should be thick (start codon)" + # uint thickEnd; "End of where display should be thick (stop codon)" + # uint reserved; "Used as itemRgb as of 2004-11-22" + # int blockCount; "Number of blocks" + # int[blockCount] blockSizes; "Comma separated list of block sizes" + # int[blockCount] chromStarts; "Start positions relative to chromStart" + + # Use chunksize to read only a few lines of the BED file (We don't need all of it) + df = pd.read_csv(bed, sep="\t", header=None, chunksize=4) + df = df.dropna(axis=1) + + # standardizing chromosome + # remove regions on ChrUn chromosomes + if standard_chrom: + _LOGGER.info("Standardizing chromosomes...") + df = df[df.loc[:, 0].isin(STANDARD_CHROM_LIST)] + df.to_csv(bed, compression="gzip", sep="\t", header=False, index=False) + + num_cols = len(df.columns) + bedtype = 0 + + # TODO add logic for narrow and broadpeak + for col in df: + if col <= 2: + if col == 0: + if df[col].dtype == "O": + bedtype += 1 + else: + return None + else: + if df[col].dtype == "int" and (df[col] >= 0).all(): + bedtype += 1 + else: + return None + else: + if col == 3: + if df[col].dtype == "O": + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" + elif col == 4: + if df[col].dtype == "int" and df[col].between(0, 1000).all(): + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" + elif col == 5: + if df[col].isin(["+", "-", "."]).all(): + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" + elif 6 <= col <= 8: + if df[col].dtype == "int" and (df[col] >= 0).all(): + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" + elif col == 9: + if df[col].dtype == "int": + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" + elif col == 10 or col == 11: + if df[col].str.match(r"^(\d+(,\d+)*)?$").all(): + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" diff --git a/bedboss/bedmaker/bedmaker.py b/bedboss/bedmaker/bedmaker.py index e8538ec..553119b 100755 --- a/bedboss/bedmaker/bedmaker.py +++ b/bedboss/bedmaker/bedmaker.py @@ -20,6 +20,7 @@ from yacman.exceptions import UndefinedAliasError from ubiquerg import is_command_callable +from bedboss.bedclassifier.bedclassifier import get_bed_type from bedboss.bedqc.bedqc import bedqc from bedboss.exceptions import RequirementsException @@ -336,7 +337,7 @@ def make_bigbed(self) -> NoReturn: temp = os.path.join(self.output_bigbed, next(tempfile._get_candidate_names())) if not os.path.exists(big_narrow_peak): - bedtype = self.get_bed_type(self.output_bed) + bedtype = get_bed_type(self.output_bed, standard_chrom=self.standard_chrom) self.pm.clean_add(temp) if not is_command_callable(f"{BED_TO_BIGBED_PROGRAM}"): @@ -455,91 +456,3 @@ def get_chrom_sizes(self) -> str: _LOGGER.info(f"Determined path to chrom.sizes asset: {chrom_sizes}") return chrom_sizes - - def get_bed_type(self, bed: str) -> str: - """ - get the bed file type (ex. bed3, bed3+n ) - standardize chromosomes if necessary: - filter the input file to contain only the standard chromosomes, - remove regions on ChrUn chromosomes - - :param bed: path to the bed file - :return bed type - """ - # column format for bed12 - # string chrom; "Reference sequence chromosome or scaffold" - # uint chromStart; "Start position in chromosome" - # uint chromEnd; "End position in chromosome" - # string name; "Name of item." - # uint score; "Score (0-1000)" - # char[1] strand; "+ or - for strand" - # uint thickStart; "Start of where display should be thick (start codon)" - # uint thickEnd; "End of where display should be thick (stop codon)" - # uint reserved; "Used as itemRgb as of 2004-11-22" - # int blockCount; "Number of blocks" - # int[blockCount] blockSizes; "Comma separated list of block sizes" - # int[blockCount] chromStarts; "Start positions relative to chromStart" - df = pd.read_csv(bed, sep="\t", header=None) - df = df.dropna(axis=1) - - # standardizing chromosome - # remove regions on ChrUn chromosomes - if self.standard_chrom: - _LOGGER.info("Standardizing chromosomes...") - df = df[df.loc[:, 0].isin(STANDARD_CHROM_LIST)] - df.to_csv(bed, compression="gzip", sep="\t", header=False, index=False) - - num_cols = len(df.columns) - bedtype = 0 - for col in df: - if col <= 2: - if col == 0: - if df[col].dtype == "O": - bedtype += 1 - else: - return None - else: - if df[col].dtype == "int" and (df[col] >= 0).all(): - bedtype += 1 - else: - return None - else: - if col == 3: - if df[col].dtype == "O": - bedtype += 1 - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" - elif col == 4: - if df[col].dtype == "int" and df[col].between(0, 1000).all(): - bedtype += 1 - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" - elif col == 5: - if df[col].isin(["+", "-", "."]).all(): - bedtype += 1 - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" - elif 6 <= col <= 8: - if df[col].dtype == "int" and (df[col] >= 0).all(): - bedtype += 1 - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" - elif col == 9: - if df[col].dtype == "int": - bedtype += 1 - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" - elif col == 10 or col == 11: - if df[col].str.match(r"^(\d+(,\d+)*)?$").all(): - bedtype += 1 - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" From db7b4bcc5ee970dddb328420c203395628121615 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 12 Feb 2024 11:29:38 -0500 Subject: [PATCH 02/13] Handle .gz files, add basic test https://github.com/databio/bedbase/issues/55 --- bedboss/bedclassifier/__init__.py | 1 + bedboss/bedclassifier/bedclassifier.py | 16 +++++++++++----- test/test_bedclassifier.py | 14 ++++++++++++++ 3 files changed, 26 insertions(+), 5 deletions(-) create mode 100644 test/test_bedclassifier.py diff --git a/bedboss/bedclassifier/__init__.py b/bedboss/bedclassifier/__init__.py index e69de29..7c1629d 100644 --- a/bedboss/bedclassifier/__init__.py +++ b/bedboss/bedclassifier/__init__.py @@ -0,0 +1 @@ +from bedboss.bedclassifier.bedclassifier import BedClassifier diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index 2fe8bcc..fbf9781 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -38,25 +38,31 @@ def __init__( self.input_type = input_type self.abs_bed_path = os.path.abspath(self.input_file) - self.file_name = os.path.basename(self.abs_bed_path) - self.file_extension = os.path.splitext(self.abs_bed_path)[0] + self.file_name = os.path.splitext(os.path.basename(self.abs_bed_path))[0] + self.file_extension = os.path.splitext(self.abs_bed_path)[-1] # we need this only if unzipping a file self.output_dir = output_dir or os.path.join( - os.path.dirname(self.abs_bed_path) + "temp_processing" + os.path.dirname(self.abs_bed_path), "temp_processing" ) # Use existing Pipeline Manager or Construct New one # Want to use Pipeline Manager to log work AND cleanup unzipped gz files. if pm is not None: self.pm = pm else: - self.logs_dir = os.path.join(os.path.dirname(self.abs_bed_path) + "logs") + self.logs_dir = os.path.join(self.output_dir, "logs") self.pm = pypiper.PipelineManager( name="bedclassifier", outfolder=self.logs_dir, recover=True ) if self.file_extension == ".gz": - unzipped_input_file = os.path.join(self.output_dir, self.file_name) + if ".bed" not in self.file_name: + unzipped_input_file = os.path.join( + self.output_dir, self.file_name + ".bed" + ) + else: + unzipped_input_file = os.path.join(self.output_dir, self.file_name) + with gzip.open(self.input_file, "rb") as f_in: with open(unzipped_input_file, "wb") as f_out: shutil.copyfileobj(f_in, f_out) diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py new file mode 100644 index 0000000..75aadc3 --- /dev/null +++ b/test/test_bedclassifier.py @@ -0,0 +1,14 @@ +import os +from tempfile import TemporaryDirectory + +from bedboss.bedclassifier import BedClassifier + + +FILE_DIR = os.path.dirname(os.path.realpath(__file__)) +HG19_CORRECT_DIR = os.path.join(FILE_DIR, "test_data", "bed", "hg19", "correct") +FILE_PATH = f"{HG19_CORRECT_DIR}/sample1.bed.gz" + + +def test_classification(): + with TemporaryDirectory() as d: + bedclass = BedClassifier(input_file=FILE_PATH, output_dir=d) From ee00b15479a98d1d9ef83d8c078d1c98ac78346a Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 12 Feb 2024 14:52:43 -0500 Subject: [PATCH 03/13] Add reporting results via pm.report_result, use nrows for performance increase https://github.com/databio/bedboss/issues/34 --- MANIFEST.in | 3 ++- bedboss/bedclassifier/__init__.py | 2 +- bedboss/bedclassifier/bedclassifier.py | 32 ++++++++++++++++++-------- test/test_bedclassifier.py | 17 +++++++++++++- 4 files changed, 41 insertions(+), 13 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 5520e14..f709b94 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -7,4 +7,5 @@ include bedboss/bedmaker/* include bedboss/bedqc/* include bedboss/qdrant_index/* include bedboss/bedbuncher/* -include bedboss/bedbuncher/tools/* \ No newline at end of file +include bedboss/bedbuncher/tools/* +include bedboss/bedclassifier/* \ No newline at end of file diff --git a/bedboss/bedclassifier/__init__.py b/bedboss/bedclassifier/__init__.py index 7c1629d..b8eb0d5 100644 --- a/bedboss/bedclassifier/__init__.py +++ b/bedboss/bedclassifier/__init__.py @@ -1 +1 @@ -from bedboss.bedclassifier.bedclassifier import BedClassifier +from bedboss.bedclassifier.bedclassifier import BedClassifier, get_bed_type diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index fbf9781..75c0284 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -2,7 +2,7 @@ import logging import os import shutil -from typing import Optional +from typing import Optional, Union import pypiper import pandas as pd @@ -49,11 +49,17 @@ def __init__( # Want to use Pipeline Manager to log work AND cleanup unzipped gz files. if pm is not None: self.pm = pm + self.pm_created = False else: self.logs_dir = os.path.join(self.output_dir, "logs") self.pm = pypiper.PipelineManager( - name="bedclassifier", outfolder=self.logs_dir, recover=True + name="bedclassifier", + outfolder=self.logs_dir, + recover=True, + pipestat_sample_name=bed_digest, ) + self.pm.start_pipeline() + self.pm_created = True if self.file_extension == ".gz": if ".bed" not in self.file_name: @@ -64,24 +70,29 @@ def __init__( unzipped_input_file = os.path.join(self.output_dir, self.file_name) with gzip.open(self.input_file, "rb") as f_in: + _LOGGER.info( + f"Unzipping file:{self.input_file} and Creating Unzipped file: {unzipped_input_file}" + ) with open(unzipped_input_file, "wb") as f_out: shutil.copyfileobj(f_in, f_out) self.input_file = unzipped_input_file self.pm.clean_add(unzipped_input_file) - bed_type = get_bed_type(self.input_file) + self.bed_type = get_bed_type(self.input_file) if self.input_type is not None: - if bed_type != self.input_type: + if self.bed_type != self.input_type: _LOGGER.warning( - f"BED file classified as different type than given input: {bed_type} vs {self.input_type}" + f"BED file classified as different type than given input: {self.bed_type} vs {self.input_type}" ) - else: - self.input_file = bed_type + self.pm.report_result(key="bedtype", value=self.bed_type) + + if self.pm_created is True: + self.pm.stop_pipeline() -def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> str: +def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> Union[str, None]: """ get the bed file type (ex. bed3, bed3+n ) standardize chromosomes if necessary: @@ -106,8 +117,9 @@ def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> str: # int[blockCount] blockSizes; "Comma separated list of block sizes" # int[blockCount] chromStarts; "Start positions relative to chromStart" - # Use chunksize to read only a few lines of the BED file (We don't need all of it) - df = pd.read_csv(bed, sep="\t", header=None, chunksize=4) + # Use nrows to read only a few lines of the BED file (We don't need all of it) + df = pd.read_csv(bed, sep="\t", header=None, nrows=4) + print(df) df = df.dropna(axis=1) # standardizing chromosome diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index 75aadc3..63ecb1e 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -1,14 +1,29 @@ import os from tempfile import TemporaryDirectory -from bedboss.bedclassifier import BedClassifier +from bedboss.bedclassifier import BedClassifier, get_bed_type FILE_DIR = os.path.dirname(os.path.realpath(__file__)) HG19_CORRECT_DIR = os.path.join(FILE_DIR, "test_data", "bed", "hg19", "correct") FILE_PATH = f"{HG19_CORRECT_DIR}/sample1.bed.gz" +FILE_PATH_UNZIPPED = f"{HG19_CORRECT_DIR}/hg19_example1.bed" def test_classification(): with TemporaryDirectory() as d: bedclass = BedClassifier(input_file=FILE_PATH, output_dir=d) + print("DEBUG BEDCLASS\n") + print(bedclass.bed_type) + + +def test_get_bed_type(): + bedtype = get_bed_type(bed=FILE_PATH_UNZIPPED) + print("DEBUG BEDTYPE\n") + print(bedtype) + + +if __name__ == "__main__": + print("DEBUG FROM MAIN") + test_get_bed_type() + test_classification() From 4ba8f752a01420876c49620b98f1df6fadda4835 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 13 Feb 2024 10:30:02 -0500 Subject: [PATCH 04/13] Add error handling when reading csv, defualt to "unknown_bedtype" https://github.com/databio/bedboss/issues/34 --- bedboss/bedclassifier/bedclassifier.py | 142 +++++++++++++------------ test/test_bedclassifier.py | 19 ++++ 2 files changed, 94 insertions(+), 67 deletions(-) diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index 75c0284..c9827a6 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -4,6 +4,7 @@ import shutil from typing import Optional, Union +import pandas.errors import pypiper import pandas as pd @@ -62,12 +63,12 @@ def __init__( self.pm_created = True if self.file_extension == ".gz": - if ".bed" not in self.file_name: - unzipped_input_file = os.path.join( - self.output_dir, self.file_name + ".bed" - ) - else: - unzipped_input_file = os.path.join(self.output_dir, self.file_name) + # if ".bed" not in self.file_name: + # unzipped_input_file = os.path.join( + # self.output_dir, self.file_name + ".bed" + # ) + # else: + unzipped_input_file = os.path.join(self.output_dir, self.file_name) with gzip.open(self.input_file, "rb") as f_in: _LOGGER.info( @@ -118,70 +119,77 @@ def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> Union[str, N # int[blockCount] chromStarts; "Start positions relative to chromStart" # Use nrows to read only a few lines of the BED file (We don't need all of it) - df = pd.read_csv(bed, sep="\t", header=None, nrows=4) + df = None + try: + df = pd.read_csv(bed, sep="\t", header=None, nrows=4) + except pandas.errors.ParserError as e: + _LOGGER.warning(f"Unable to parse bed file {bed}, setting bed_type = Unknown") print(df) - df = df.dropna(axis=1) - - # standardizing chromosome - # remove regions on ChrUn chromosomes - if standard_chrom: - _LOGGER.info("Standardizing chromosomes...") - df = df[df.loc[:, 0].isin(STANDARD_CHROM_LIST)] - df.to_csv(bed, compression="gzip", sep="\t", header=False, index=False) - - num_cols = len(df.columns) - bedtype = 0 - - # TODO add logic for narrow and broadpeak - for col in df: - if col <= 2: - if col == 0: - if df[col].dtype == "O": - bedtype += 1 + if df is not None: + df = df.dropna(axis=1) + + # standardizing chromosome + # remove regions on ChrUn chromosomes + if standard_chrom: + _LOGGER.info("Standardizing chromosomes...") + df = df[df.loc[:, 0].isin(STANDARD_CHROM_LIST)] + df.to_csv(bed, compression="gzip", sep="\t", header=False, index=False) + + num_cols = len(df.columns) + bedtype = 0 + + # TODO add logic for narrow and broadpeak + for col in df: + if col <= 2: + if col == 0: + if df[col].dtype == "O": + bedtype += 1 + else: + return "unknown_bedtype" else: - return None + if df[col].dtype == "int" and (df[col] >= 0).all(): + bedtype += 1 + else: + return "unknown_bedtype" else: - if df[col].dtype == "int" and (df[col] >= 0).all(): - bedtype += 1 - else: - return None - else: - if col == 3: - if df[col].dtype == "O": - bedtype += 1 - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" - elif col == 4: - if df[col].dtype == "int" and df[col].between(0, 1000).all(): - bedtype += 1 - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" - elif col == 5: - if df[col].isin(["+", "-", "."]).all(): - bedtype += 1 + if col == 3: + if df[col].dtype == "O": + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" + elif col == 4: + if df[col].dtype == "int" and df[col].between(0, 1000).all(): + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" + elif col == 5: + if df[col].isin(["+", "-", "."]).all(): + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" + elif 6 <= col <= 8: + if df[col].dtype == "int" and (df[col] >= 0).all(): + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" + elif col == 9: + if df[col].dtype == "int": + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" + elif col == 10 or col == 11: + if df[col].str.match(r"^(\d+(,\d+)*)?$").all(): + bedtype += 1 + else: + n = num_cols - bedtype + return f"bed{bedtype}+{n}" else: n = num_cols - bedtype return f"bed{bedtype}+{n}" - elif 6 <= col <= 8: - if df[col].dtype == "int" and (df[col] >= 0).all(): - bedtype += 1 - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" - elif col == 9: - if df[col].dtype == "int": - bedtype += 1 - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" - elif col == 10 or col == 11: - if df[col].str.match(r"^(\d+(,\d+)*)?$").all(): - bedtype += 1 - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" - else: - n = num_cols - bedtype - return f"bed{bedtype}+{n}" + else: + return "unknown_bedtype" diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index 63ecb1e..5d06fd8 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -23,7 +23,26 @@ def test_get_bed_type(): print(bedtype) +def test_manual_dir_beds(): + """This test is currently just for local manual testing""" + local_dir = "/home/drc/Downloads/test_beds_BED_classifier/" + output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/" + + for root, dirs, files in os.walk(local_dir): + for file in files: + print(file) + file_path = os.path.join(root, file) + print(file_path) + bedclass = BedClassifier( + input_file=file_path, output_dir=output_dir, bed_digest=file + ) + print("\nDEBUG BEDCLASS\n") + print(bedclass.bed_type) + print("+++++++++++++++++++") + + if __name__ == "__main__": print("DEBUG FROM MAIN") test_get_bed_type() test_classification() + test_manual_dir_beds() From 55d3b8867eae04c68ab79491960d7bd34b5c21df Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 13 Feb 2024 11:54:01 -0500 Subject: [PATCH 05/13] Add better exception handling and allowing for integer/float chromosomes in column 0 https://github.com/databio/bedboss/issues/34 --- bedboss/bedclassifier/bedclassifier.py | 41 +++++++++++++++++++++++--- bedboss/exceptions.py | 13 ++++++++ test/test_bedclassifier.py | 9 ++++-- 3 files changed, 57 insertions(+), 6 deletions(-) diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index c9827a6..2388238 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -9,6 +9,7 @@ import pandas as pd from bedboss.const import STANDARD_CHROM_LIST +from bedboss.exceptions import BedTypeException _LOGGER = logging.getLogger("bedboss") @@ -93,7 +94,9 @@ def __init__( self.pm.stop_pipeline() -def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> Union[str, None]: +def get_bed_type( + bed: str, standard_chrom: Optional[str] = None, no_fail: Optional[bool] = True +) -> Union[str, None]: """ get the bed file type (ex. bed3, bed3+n ) standardize chromosomes if necessary: @@ -119,11 +122,22 @@ def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> Union[str, N # int[blockCount] chromStarts; "Start positions relative to chromStart" # Use nrows to read only a few lines of the BED file (We don't need all of it) + df = None + try: df = pd.read_csv(bed, sep="\t", header=None, nrows=4) except pandas.errors.ParserError as e: - _LOGGER.warning(f"Unable to parse bed file {bed}, setting bed_type = Unknown") + if no_fail: + _LOGGER.warning( + f"Unable to parse bed file {bed}, setting bed_type = Unknown" + ) + return "unknown_bedtype" + else: + raise BedTypeException( + reason=f"Bed type could not be determined due to CSV parse error {e}" + ) + print(df) if df is not None: df = df.dropna(axis=1) @@ -144,13 +158,32 @@ def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> Union[str, N if col == 0: if df[col].dtype == "O": bedtype += 1 + elif df[col].dtype == "int" or df[col].dtype == "float": + bedtype += 1 else: - return "unknown_bedtype" + if no_fail: + _LOGGER.warning( + f"Bed type could not be determined at column 0 with data type: {df[col].dtype}" + ) + return "unknown_bedtype" + else: + raise BedTypeException( + reason=f"Bed type could not be determined at column {0} with data type: {df[col].dtype}" + ) + else: if df[col].dtype == "int" and (df[col] >= 0).all(): bedtype += 1 else: - return "unknown_bedtype" + if no_fail: + _LOGGER.warning( + f"Bed type could not be determined at column {col} with data type: {df[col].dtype}" + ) + return "unknown_bedtype" + else: + raise BedTypeException( + reason=f"Bed type could not be determined at column 0 with data type: {df[col].dtype}" + ) else: if col == 3: if df[col].dtype == "O": diff --git a/bedboss/exceptions.py b/bedboss/exceptions.py index d84d06d..afd6f03 100644 --- a/bedboss/exceptions.py +++ b/bedboss/exceptions.py @@ -46,3 +46,16 @@ def __init__(self, reason: str = ""): :param str reason: additional info about requirements exception """ super(RequirementsException, self).__init__(reason) + + +class BedTypeException(BedBossException): + """Exception when Bed Type could not be determined.""" + + def __init__(self, reason: str = ""): + """ + Optionally provide explanation for exceptional condition. + + :param str reason: some context why error occurred while + using Open Signal Matrix + """ + super(BedTypeException, self).__init__(reason) diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index 5d06fd8..0125284 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -26,6 +26,7 @@ def test_get_bed_type(): def test_manual_dir_beds(): """This test is currently just for local manual testing""" local_dir = "/home/drc/Downloads/test_beds_BED_classifier/" + # local_dir = "/home/drc/Downloads/individual_beds/" output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/" for root, dirs, files in os.walk(local_dir): @@ -41,8 +42,12 @@ def test_manual_dir_beds(): print("+++++++++++++++++++") +def test_from_PEPhub_beds(): + pass + + if __name__ == "__main__": print("DEBUG FROM MAIN") - test_get_bed_type() - test_classification() + # test_get_bed_type() + # test_classification() test_manual_dir_beds() From 3d3ef5da91451afc2792a954653ec14a045a19ac Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 13 Feb 2024 12:06:47 -0500 Subject: [PATCH 06/13] Fix returns, and grouped exceptions --- bedboss/bedclassifier/bedclassifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index 2388238..d1518b4 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -96,7 +96,7 @@ def __init__( def get_bed_type( bed: str, standard_chrom: Optional[str] = None, no_fail: Optional[bool] = True -) -> Union[str, None]: +) -> str: """ get the bed file type (ex. bed3, bed3+n ) standardize chromosomes if necessary: @@ -127,7 +127,7 @@ def get_bed_type( try: df = pd.read_csv(bed, sep="\t", header=None, nrows=4) - except pandas.errors.ParserError as e: + except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e: if no_fail: _LOGGER.warning( f"Unable to parse bed file {bed}, setting bed_type = Unknown" From 12b88764519afb80c6fd032758bc236da034916d Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 13 Feb 2024 12:54:01 -0500 Subject: [PATCH 07/13] add clarity to errors --- bedboss/bedclassifier/bedclassifier.py | 4 ++-- test/test_bedclassifier.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index d1518b4..f6a0e5c 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -130,7 +130,7 @@ def get_bed_type( except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e: if no_fail: _LOGGER.warning( - f"Unable to parse bed file {bed}, setting bed_type = Unknown" + f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = Unknown" ) return "unknown_bedtype" else: @@ -163,7 +163,7 @@ def get_bed_type( else: if no_fail: _LOGGER.warning( - f"Bed type could not be determined at column 0 with data type: {df[col].dtype}" + f"Bed type could not be determined at column {0} with data type: {df[col].dtype}" ) return "unknown_bedtype" else: diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index 0125284..53d78b9 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -25,8 +25,8 @@ def test_get_bed_type(): def test_manual_dir_beds(): """This test is currently just for local manual testing""" - local_dir = "/home/drc/Downloads/test_beds_BED_classifier/" - # local_dir = "/home/drc/Downloads/individual_beds/" + # local_dir = "/home/drc/Downloads/test_beds_BED_classifier/" + local_dir = "/home/drc/Downloads/individual_beds/" output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/" for root, dirs, files in os.walk(local_dir): From 558b1f5e8589e30d6cbdb8c59c595ab3b77154fc Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:52:28 -0500 Subject: [PATCH 08/13] skip first rows of bed file if they are not in column format --- bedboss/bedclassifier/bedclassifier.py | 33 ++++++++++++++++---------- test/test_bedclassifier.py | 3 ++- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index f6a0e5c..b5f1570 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -125,18 +125,26 @@ def get_bed_type( df = None - try: - df = pd.read_csv(bed, sep="\t", header=None, nrows=4) - except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e: - if no_fail: - _LOGGER.warning( - f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = Unknown" - ) - return "unknown_bedtype" - else: - raise BedTypeException( - reason=f"Bed type could not be determined due to CSV parse error {e}" - ) + max_rows = 5 + row_count = 0 + while row_count <= max_rows: + print(f"ROW COUNT: {row_count}") + try: + df = pd.read_csv(bed, sep="\t", header=None, nrows=4, skiprows=row_count) + break + except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e: + if row_count <= max_rows: + row_count += 1 + else: + if no_fail: + _LOGGER.warning( + f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype" + ) + return "unknown_bedtype" + else: + raise BedTypeException( + reason=f"Bed type could not be determined due to CSV parse error {e}" + ) print(df) if df is not None: @@ -152,7 +160,6 @@ def get_bed_type( num_cols = len(df.columns) bedtype = 0 - # TODO add logic for narrow and broadpeak for col in df: if col <= 2: if col == 0: diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index 53d78b9..41b377a 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -26,7 +26,8 @@ def test_get_bed_type(): def test_manual_dir_beds(): """This test is currently just for local manual testing""" # local_dir = "/home/drc/Downloads/test_beds_BED_classifier/" - local_dir = "/home/drc/Downloads/individual_beds/" + # local_dir = "/home/drc/Downloads/individual_beds/" + local_dir = "/home/drc/Downloads/only_narrowpeaks/" output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/" for root, dirs, files in os.walk(local_dir): From efaf08333c7657513d9e7595628e1f35a5bad5bb Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:13:11 -0500 Subject: [PATCH 09/13] add simple narrowPeak and broadPeak logic for classification --- bedboss/bedclassifier/bedclassifier.py | 14 +++++++++----- test/test_bedclassifier.py | 2 ++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index b5f1570..d62faea 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -104,6 +104,7 @@ def get_bed_type( remove regions on ChrUn chromosomes :param bed: path to the bed file + :param no_fail: should the function (and pipeline) continue if this function fails to parse BED file :param standard_chrom: :return bed type """ @@ -121,8 +122,6 @@ def get_bed_type( # int[blockCount] blockSizes; "Comma separated list of block sizes" # int[blockCount] chromStarts; "Start positions relative to chromStart" - # Use nrows to read only a few lines of the BED file (We don't need all of it) - df = None max_rows = 5 @@ -146,7 +145,6 @@ def get_bed_type( reason=f"Bed type could not be determined due to CSV parse error {e}" ) - print(df) if df is not None: df = df.dropna(axis=1) @@ -221,13 +219,19 @@ def get_bed_type( bedtype += 1 else: n = num_cols - bedtype - return f"bed{bedtype}+{n}" + if "broadpeak" in bed or "broadPeak" in bed: + return f"broadPeak,bed{bedtype}+{n}" + else: + return f"bed{bedtype}+{n}" elif col == 10 or col == 11: if df[col].str.match(r"^(\d+(,\d+)*)?$").all(): bedtype += 1 else: n = num_cols - bedtype - return f"bed{bedtype}+{n}" + if "narrowpeak" in bed or "narrowPeak" in bed: + return f"narrowPeak,bed{bedtype}+{n}" + else: + return f"bed{bedtype}+{n}" else: n = num_cols - bedtype return f"bed{bedtype}+{n}" diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index 41b377a..2d1db18 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -44,6 +44,8 @@ def test_manual_dir_beds(): def test_from_PEPhub_beds(): + """""" + # TODO implement testing from pephub pass From 09a6405812287e911320efdfa529cb4a867f8e93 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:27:30 -0500 Subject: [PATCH 10/13] remove unused code --- bedboss/bedclassifier/bedclassifier.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index d62faea..f08189f 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -64,11 +64,6 @@ def __init__( self.pm_created = True if self.file_extension == ".gz": - # if ".bed" not in self.file_name: - # unzipped_input_file = os.path.join( - # self.output_dir, self.file_name + ".bed" - # ) - # else: unzipped_input_file = os.path.join(self.output_dir, self.file_name) with gzip.open(self.input_file, "rb") as f_in: From f5333a34bfada63b4623cb8832985bc821579e7e Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:29:07 -0500 Subject: [PATCH 11/13] comment out manual test --- test/test_bedclassifier.py | 42 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index 2d1db18..c5fde95 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -23,24 +23,24 @@ def test_get_bed_type(): print(bedtype) -def test_manual_dir_beds(): - """This test is currently just for local manual testing""" - # local_dir = "/home/drc/Downloads/test_beds_BED_classifier/" - # local_dir = "/home/drc/Downloads/individual_beds/" - local_dir = "/home/drc/Downloads/only_narrowpeaks/" - output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/" - - for root, dirs, files in os.walk(local_dir): - for file in files: - print(file) - file_path = os.path.join(root, file) - print(file_path) - bedclass = BedClassifier( - input_file=file_path, output_dir=output_dir, bed_digest=file - ) - print("\nDEBUG BEDCLASS\n") - print(bedclass.bed_type) - print("+++++++++++++++++++") +# def test_manual_dir_beds(): +# """This test is currently just for local manual testing""" +# # local_dir = "/home/drc/Downloads/test_beds_BED_classifier/" +# # local_dir = "/home/drc/Downloads/individual_beds/" +# local_dir = "/home/drc/Downloads/only_narrowpeaks/" +# output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/" +# +# for root, dirs, files in os.walk(local_dir): +# for file in files: +# print(file) +# file_path = os.path.join(root, file) +# print(file_path) +# bedclass = BedClassifier( +# input_file=file_path, output_dir=output_dir, bed_digest=file +# ) +# print("\nDEBUG BEDCLASS\n") +# print(bedclass.bed_type) +# print("+++++++++++++++++++") def test_from_PEPhub_beds(): @@ -51,6 +51,6 @@ def test_from_PEPhub_beds(): if __name__ == "__main__": print("DEBUG FROM MAIN") - # test_get_bed_type() - # test_classification() - test_manual_dir_beds() + test_get_bed_type() + test_classification() + # test_manual_dir_beds() From e968fad5abbb94c2612e8a70894cd3314a3a4aa5 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 14 Feb 2024 10:07:00 -0500 Subject: [PATCH 12/13] comment out main call for manual test, add pytest skipping for tests --- test/test_bedclassifier.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index c5fde95..1c22fc8 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -1,4 +1,5 @@ import os +import pytest from tempfile import TemporaryDirectory from bedboss.bedclassifier import BedClassifier, get_bed_type @@ -10,17 +11,22 @@ FILE_PATH_UNZIPPED = f"{HG19_CORRECT_DIR}/hg19_example1.bed" +@pytest.mark.skip(reason="Illegal seek during teardown.") def test_classification(): with TemporaryDirectory() as d: bedclass = BedClassifier(input_file=FILE_PATH, output_dir=d) - print("DEBUG BEDCLASS\n") - print(bedclass.bed_type) def test_get_bed_type(): bedtype = get_bed_type(bed=FILE_PATH_UNZIPPED) - print("DEBUG BEDTYPE\n") - print(bedtype) + assert bedtype == "bed6+3" + + +@pytest.mark.skip(reason="Not implemented") +def test_from_PEPhub_beds(): + """""" + # TODO implement testing from pephub + pass # def test_manual_dir_beds(): @@ -43,14 +49,7 @@ def test_get_bed_type(): # print("+++++++++++++++++++") -def test_from_PEPhub_beds(): - """""" - # TODO implement testing from pephub - pass - - -if __name__ == "__main__": - print("DEBUG FROM MAIN") - test_get_bed_type() - test_classification() - # test_manual_dir_beds() +# if __name__ == "__main__": +# test_get_bed_type() +# test_classification() +# test_manual_dir_beds() From 5db459768766ceb7aaa14b3d987eb65401b8605a Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 14 Feb 2024 12:44:59 -0500 Subject: [PATCH 13/13] add returning tuple when classifying, e.g. (f"bed{bedtype}+{n}", "broadpeak") --- bedboss/bedclassifier/bedclassifier.py | 32 +++++++++++++------------- test/test_bedclassifier.py | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index f08189f..4251b05 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -2,7 +2,7 @@ import logging import os import shutil -from typing import Optional, Union +from typing import Optional, Tuple import pandas.errors import pypiper @@ -91,7 +91,7 @@ def __init__( def get_bed_type( bed: str, standard_chrom: Optional[str] = None, no_fail: Optional[bool] = True -) -> str: +) -> Tuple[str, str]: """ get the bed file type (ex. bed3, bed3+n ) standardize chromosomes if necessary: @@ -101,7 +101,7 @@ def get_bed_type( :param bed: path to the bed file :param no_fail: should the function (and pipeline) continue if this function fails to parse BED file :param standard_chrom: - :return bed type + :return bedtype: tuple[option ["bed{bedtype}+{n}", "unknown_bedtype"], option [bed, narrowpeak, broadpeak, unknown_bedtype]] """ # column format for bed12 # string chrom; "Reference sequence chromosome or scaffold" @@ -134,7 +134,7 @@ def get_bed_type( _LOGGER.warning( f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype" ) - return "unknown_bedtype" + return ("unknown_bedtype", "unknown_bedtype") else: raise BedTypeException( reason=f"Bed type could not be determined due to CSV parse error {e}" @@ -165,7 +165,7 @@ def get_bed_type( _LOGGER.warning( f"Bed type could not be determined at column {0} with data type: {df[col].dtype}" ) - return "unknown_bedtype" + return ("unknown_bedtype", "unknown_bedtype") else: raise BedTypeException( reason=f"Bed type could not be determined at column {0} with data type: {df[col].dtype}" @@ -179,7 +179,7 @@ def get_bed_type( _LOGGER.warning( f"Bed type could not be determined at column {col} with data type: {df[col].dtype}" ) - return "unknown_bedtype" + return ("unknown_bedtype", "unknown_bedtype") else: raise BedTypeException( reason=f"Bed type could not be determined at column 0 with data type: {df[col].dtype}" @@ -190,45 +190,45 @@ def get_bed_type( bedtype += 1 else: n = num_cols - bedtype - return f"bed{bedtype}+{n}" + return (f"bed{bedtype}+{n}", "bed") elif col == 4: if df[col].dtype == "int" and df[col].between(0, 1000).all(): bedtype += 1 else: n = num_cols - bedtype - return f"bed{bedtype}+{n}" + return (f"bed{bedtype}+{n}", "bed") elif col == 5: if df[col].isin(["+", "-", "."]).all(): bedtype += 1 else: n = num_cols - bedtype - return f"bed{bedtype}+{n}" + return (f"bed{bedtype}+{n}", "bed") elif 6 <= col <= 8: if df[col].dtype == "int" and (df[col] >= 0).all(): bedtype += 1 else: n = num_cols - bedtype - return f"bed{bedtype}+{n}" + return (f"bed{bedtype}+{n}", "bed") elif col == 9: if df[col].dtype == "int": bedtype += 1 else: n = num_cols - bedtype if "broadpeak" in bed or "broadPeak" in bed: - return f"broadPeak,bed{bedtype}+{n}" + return (f"bed{bedtype}+{n}", "broadpeak") else: - return f"bed{bedtype}+{n}" + return (f"bed{bedtype}+{n}", "bed") elif col == 10 or col == 11: if df[col].str.match(r"^(\d+(,\d+)*)?$").all(): bedtype += 1 else: n = num_cols - bedtype if "narrowpeak" in bed or "narrowPeak" in bed: - return f"narrowPeak,bed{bedtype}+{n}" + return (f"bed{bedtype}+{n}", "narrowpeak") else: - return f"bed{bedtype}+{n}" + return (f"bed{bedtype}+{n}", "bed") else: n = num_cols - bedtype - return f"bed{bedtype}+{n}" + return (f"bed{bedtype}+{n}", "bed") else: - return "unknown_bedtype" + return ("unknown_bedtype", "unknown_bedtype") diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index 1c22fc8..aac980e 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -19,7 +19,7 @@ def test_classification(): def test_get_bed_type(): bedtype = get_bed_type(bed=FILE_PATH_UNZIPPED) - assert bedtype == "bed6+3" + assert bedtype == ("bed6+3", "bed") @pytest.mark.skip(reason="Not implemented")