From ab9974f4ba3ea7379f6b6315b27ac645305ac0ae Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Fri, 9 Feb 2024 16:38:28 -0500
Subject: [PATCH 01/13] Initial work on bedclassifier for
 https://github.com/databio/bedbase/issues/55

---
 bedboss/bedclassifier/__init__.py      |   0
 bedboss/bedclassifier/bedclassifier.py | 169 +++++++++++++++++++++++++
 bedboss/bedmaker/bedmaker.py           |  91 +------------
 3 files changed, 171 insertions(+), 89 deletions(-)
 create mode 100644 bedboss/bedclassifier/__init__.py
 create mode 100644 bedboss/bedclassifier/bedclassifier.py

diff --git a/bedboss/bedclassifier/__init__.py b/bedboss/bedclassifier/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
new file mode 100644
index 0000000..2fe8bcc
--- /dev/null
+++ b/bedboss/bedclassifier/bedclassifier.py
@@ -0,0 +1,169 @@
+import gzip
+import logging
+import os
+import shutil
+from typing import Optional
+
+import pypiper
+import pandas as pd
+
+from bedboss.const import STANDARD_CHROM_LIST
+
+_LOGGER = logging.getLogger("bedboss")
+
+
+class BedClassifier:
+    """
+    This will take the input of either a .bed or a .bed.gz and classify the type of BED file.
+
+    Types:
+    BED, BED2 - BED12, narrowPeak, broadPeak
+    UnknownType
+
+    """
+
+    def __init__(
+        self,
+        input_file: str,
+        output_dir: Optional[str] = None,
+        bed_digest: Optional[str] = None,
+        input_type: Optional[str] = None,
+        pm: pypiper.PipelineManager = None,
+        report_to_database: Optional[bool] = False,
+    ):
+        # Raise Exception if input_type is given and it is NOT a BED file
+        # Raise Exception if the input file cannot be resolved
+        self.input_file = input_file
+        self.bed_digest = bed_digest
+        self.input_type = input_type
+
+        self.abs_bed_path = os.path.abspath(self.input_file)
+        self.file_name = os.path.basename(self.abs_bed_path)
+        self.file_extension = os.path.splitext(self.abs_bed_path)[0]
+
+        # we need this only if unzipping a file
+        self.output_dir = output_dir or os.path.join(
+            os.path.dirname(self.abs_bed_path) + "temp_processing"
+        )
+        # Use existing Pipeline Manager or Construct New one
+        # Want to use Pipeline Manager to log work AND cleanup unzipped gz files.
+        if pm is not None:
+            self.pm = pm
+        else:
+            self.logs_dir = os.path.join(os.path.dirname(self.abs_bed_path) + "logs")
+            self.pm = pypiper.PipelineManager(
+                name="bedclassifier", outfolder=self.logs_dir, recover=True
+            )
+
+        if self.file_extension == ".gz":
+            unzipped_input_file = os.path.join(self.output_dir, self.file_name)
+            with gzip.open(self.input_file, "rb") as f_in:
+                with open(unzipped_input_file, "wb") as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+            self.input_file = unzipped_input_file
+            self.pm.clean_add(unzipped_input_file)
+
+        bed_type = get_bed_type(self.input_file)
+
+        if self.input_type is not None:
+            if bed_type != self.input_type:
+                _LOGGER.warning(
+                    f"BED file classified as different type than given input: {bed_type} vs {self.input_type}"
+                )
+
+        else:
+            self.input_file = bed_type
+
+
+def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> str:
+    """
+    get the bed file type (ex. bed3, bed3+n )
+    standardize chromosomes if necessary:
+    filter the input file to contain only the standard chromosomes,
+    remove regions on ChrUn chromosomes
+
+    :param bed: path to the bed file
+    :param standard_chrom:
+    :return bed type
+    """
+    #    column format for bed12
+    #    string chrom;       "Reference sequence chromosome or scaffold"
+    #    uint   chromStart;  "Start position in chromosome"
+    #    uint   chromEnd;    "End position in chromosome"
+    #    string name;        "Name of item."
+    #    uint score;          "Score (0-1000)"
+    #    char[1] strand;     "+ or - for strand"
+    #    uint thickStart;   "Start of where display should be thick (start codon)"
+    #    uint thickEnd;     "End of where display should be thick (stop codon)"
+    #    uint reserved;     "Used as itemRgb as of 2004-11-22"
+    #    int blockCount;    "Number of blocks"
+    #    int[blockCount] blockSizes; "Comma separated list of block sizes"
+    #    int[blockCount] chromStarts; "Start positions relative to chromStart"
+
+    # Use chunksize to read only a few lines of the BED file (We don't need all of it)
+    df = pd.read_csv(bed, sep="\t", header=None, chunksize=4)
+    df = df.dropna(axis=1)
+
+    # standardizing chromosome
+    # remove regions on ChrUn chromosomes
+    if standard_chrom:
+        _LOGGER.info("Standardizing chromosomes...")
+        df = df[df.loc[:, 0].isin(STANDARD_CHROM_LIST)]
+        df.to_csv(bed, compression="gzip", sep="\t", header=False, index=False)
+
+    num_cols = len(df.columns)
+    bedtype = 0
+
+    # TODO add logic for narrow and broadpeak
+    for col in df:
+        if col <= 2:
+            if col == 0:
+                if df[col].dtype == "O":
+                    bedtype += 1
+                else:
+                    return None
+            else:
+                if df[col].dtype == "int" and (df[col] >= 0).all():
+                    bedtype += 1
+                else:
+                    return None
+        else:
+            if col == 3:
+                if df[col].dtype == "O":
+                    bedtype += 1
+                else:
+                    n = num_cols - bedtype
+                    return f"bed{bedtype}+{n}"
+            elif col == 4:
+                if df[col].dtype == "int" and df[col].between(0, 1000).all():
+                    bedtype += 1
+                else:
+                    n = num_cols - bedtype
+                    return f"bed{bedtype}+{n}"
+            elif col == 5:
+                if df[col].isin(["+", "-", "."]).all():
+                    bedtype += 1
+                else:
+                    n = num_cols - bedtype
+                    return f"bed{bedtype}+{n}"
+            elif 6 <= col <= 8:
+                if df[col].dtype == "int" and (df[col] >= 0).all():
+                    bedtype += 1
+                else:
+                    n = num_cols - bedtype
+                    return f"bed{bedtype}+{n}"
+            elif col == 9:
+                if df[col].dtype == "int":
+                    bedtype += 1
+                else:
+                    n = num_cols - bedtype
+                    return f"bed{bedtype}+{n}"
+            elif col == 10 or col == 11:
+                if df[col].str.match(r"^(\d+(,\d+)*)?$").all():
+                    bedtype += 1
+                else:
+                    n = num_cols - bedtype
+                    return f"bed{bedtype}+{n}"
+            else:
+                n = num_cols - bedtype
+                return f"bed{bedtype}+{n}"
diff --git a/bedboss/bedmaker/bedmaker.py b/bedboss/bedmaker/bedmaker.py
index e8538ec..553119b 100755
--- a/bedboss/bedmaker/bedmaker.py
+++ b/bedboss/bedmaker/bedmaker.py
@@ -20,6 +20,7 @@
 from yacman.exceptions import UndefinedAliasError
 from ubiquerg import is_command_callable
 
+from bedboss.bedclassifier.bedclassifier import get_bed_type
 from bedboss.bedqc.bedqc import bedqc
 from bedboss.exceptions import RequirementsException
 
@@ -336,7 +337,7 @@ def make_bigbed(self) -> NoReturn:
         temp = os.path.join(self.output_bigbed, next(tempfile._get_candidate_names()))
 
         if not os.path.exists(big_narrow_peak):
-            bedtype = self.get_bed_type(self.output_bed)
+            bedtype = get_bed_type(self.output_bed, standard_chrom=self.standard_chrom)
             self.pm.clean_add(temp)
 
             if not is_command_callable(f"{BED_TO_BIGBED_PROGRAM}"):
@@ -455,91 +456,3 @@ def get_chrom_sizes(self) -> str:
         _LOGGER.info(f"Determined path to chrom.sizes asset: {chrom_sizes}")
 
         return chrom_sizes
-
-    def get_bed_type(self, bed: str) -> str:
-        """
-        get the bed file type (ex. bed3, bed3+n )
-        standardize chromosomes if necessary:
-        filter the input file to contain only the standard chromosomes,
-        remove regions on ChrUn chromosomes
-
-        :param bed: path to the bed file
-        :return bed type
-        """
-        #    column format for bed12
-        #    string chrom;       "Reference sequence chromosome or scaffold"
-        #    uint   chromStart;  "Start position in chromosome"
-        #    uint   chromEnd;    "End position in chromosome"
-        #    string name;        "Name of item."
-        #    uint score;          "Score (0-1000)"
-        #    char[1] strand;     "+ or - for strand"
-        #    uint thickStart;   "Start of where display should be thick (start codon)"
-        #    uint thickEnd;     "End of where display should be thick (stop codon)"
-        #    uint reserved;     "Used as itemRgb as of 2004-11-22"
-        #    int blockCount;    "Number of blocks"
-        #    int[blockCount] blockSizes; "Comma separated list of block sizes"
-        #    int[blockCount] chromStarts; "Start positions relative to chromStart"
-        df = pd.read_csv(bed, sep="\t", header=None)
-        df = df.dropna(axis=1)
-
-        # standardizing chromosome
-        # remove regions on ChrUn chromosomes
-        if self.standard_chrom:
-            _LOGGER.info("Standardizing chromosomes...")
-            df = df[df.loc[:, 0].isin(STANDARD_CHROM_LIST)]
-            df.to_csv(bed, compression="gzip", sep="\t", header=False, index=False)
-
-        num_cols = len(df.columns)
-        bedtype = 0
-        for col in df:
-            if col <= 2:
-                if col == 0:
-                    if df[col].dtype == "O":
-                        bedtype += 1
-                    else:
-                        return None
-                else:
-                    if df[col].dtype == "int" and (df[col] >= 0).all():
-                        bedtype += 1
-                    else:
-                        return None
-            else:
-                if col == 3:
-                    if df[col].dtype == "O":
-                        bedtype += 1
-                    else:
-                        n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
-                elif col == 4:
-                    if df[col].dtype == "int" and df[col].between(0, 1000).all():
-                        bedtype += 1
-                    else:
-                        n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
-                elif col == 5:
-                    if df[col].isin(["+", "-", "."]).all():
-                        bedtype += 1
-                    else:
-                        n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
-                elif 6 <= col <= 8:
-                    if df[col].dtype == "int" and (df[col] >= 0).all():
-                        bedtype += 1
-                    else:
-                        n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
-                elif col == 9:
-                    if df[col].dtype == "int":
-                        bedtype += 1
-                    else:
-                        n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
-                elif col == 10 or col == 11:
-                    if df[col].str.match(r"^(\d+(,\d+)*)?$").all():
-                        bedtype += 1
-                    else:
-                        n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
-                else:
-                    n = num_cols - bedtype
-                    return f"bed{bedtype}+{n}"

From db7b4bcc5ee970dddb328420c203395628121615 Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Mon, 12 Feb 2024 11:29:38 -0500
Subject: [PATCH 02/13] Handle .gz files, add basic test
 https://github.com/databio/bedbase/issues/55

---
 bedboss/bedclassifier/__init__.py      |  1 +
 bedboss/bedclassifier/bedclassifier.py | 16 +++++++++++-----
 test/test_bedclassifier.py             | 14 ++++++++++++++
 3 files changed, 26 insertions(+), 5 deletions(-)
 create mode 100644 test/test_bedclassifier.py

diff --git a/bedboss/bedclassifier/__init__.py b/bedboss/bedclassifier/__init__.py
index e69de29..7c1629d 100644
--- a/bedboss/bedclassifier/__init__.py
+++ b/bedboss/bedclassifier/__init__.py
@@ -0,0 +1 @@
+from bedboss.bedclassifier.bedclassifier import BedClassifier
diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
index 2fe8bcc..fbf9781 100644
--- a/bedboss/bedclassifier/bedclassifier.py
+++ b/bedboss/bedclassifier/bedclassifier.py
@@ -38,25 +38,31 @@ def __init__(
         self.input_type = input_type
 
         self.abs_bed_path = os.path.abspath(self.input_file)
-        self.file_name = os.path.basename(self.abs_bed_path)
-        self.file_extension = os.path.splitext(self.abs_bed_path)[0]
+        self.file_name = os.path.splitext(os.path.basename(self.abs_bed_path))[0]
+        self.file_extension = os.path.splitext(self.abs_bed_path)[-1]
 
         # we need this only if unzipping a file
         self.output_dir = output_dir or os.path.join(
-            os.path.dirname(self.abs_bed_path) + "temp_processing"
+            os.path.dirname(self.abs_bed_path), "temp_processing"
         )
         # Use existing Pipeline Manager or Construct New one
         # Want to use Pipeline Manager to log work AND cleanup unzipped gz files.
         if pm is not None:
             self.pm = pm
         else:
-            self.logs_dir = os.path.join(os.path.dirname(self.abs_bed_path) + "logs")
+            self.logs_dir = os.path.join(self.output_dir, "logs")
             self.pm = pypiper.PipelineManager(
                 name="bedclassifier", outfolder=self.logs_dir, recover=True
             )
 
         if self.file_extension == ".gz":
-            unzipped_input_file = os.path.join(self.output_dir, self.file_name)
+            if ".bed" not in self.file_name:
+                unzipped_input_file = os.path.join(
+                    self.output_dir, self.file_name + ".bed"
+                )
+            else:
+                unzipped_input_file = os.path.join(self.output_dir, self.file_name)
+
             with gzip.open(self.input_file, "rb") as f_in:
                 with open(unzipped_input_file, "wb") as f_out:
                     shutil.copyfileobj(f_in, f_out)
diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py
new file mode 100644
index 0000000..75aadc3
--- /dev/null
+++ b/test/test_bedclassifier.py
@@ -0,0 +1,14 @@
+import os
+from tempfile import TemporaryDirectory
+
+from bedboss.bedclassifier import BedClassifier
+
+
+FILE_DIR = os.path.dirname(os.path.realpath(__file__))
+HG19_CORRECT_DIR = os.path.join(FILE_DIR, "test_data", "bed", "hg19", "correct")
+FILE_PATH = f"{HG19_CORRECT_DIR}/sample1.bed.gz"
+
+
+def test_classification():
+    with TemporaryDirectory() as d:
+        bedclass = BedClassifier(input_file=FILE_PATH, output_dir=d)

From ee00b15479a98d1d9ef83d8c078d1c98ac78346a Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Mon, 12 Feb 2024 14:52:43 -0500
Subject: [PATCH 03/13] Add reporting results via pm.report_result, use nrows
 for performance increase https://github.com/databio/bedboss/issues/34

---
 MANIFEST.in                            |  3 ++-
 bedboss/bedclassifier/__init__.py      |  2 +-
 bedboss/bedclassifier/bedclassifier.py | 32 ++++++++++++++++++--------
 test/test_bedclassifier.py             | 17 +++++++++++++-
 4 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 5520e14..f709b94 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -7,4 +7,5 @@ include bedboss/bedmaker/*
 include bedboss/bedqc/*
 include bedboss/qdrant_index/*
 include bedboss/bedbuncher/*
-include bedboss/bedbuncher/tools/*
\ No newline at end of file
+include bedboss/bedbuncher/tools/*
+include bedboss/bedclassifier/*
\ No newline at end of file
diff --git a/bedboss/bedclassifier/__init__.py b/bedboss/bedclassifier/__init__.py
index 7c1629d..b8eb0d5 100644
--- a/bedboss/bedclassifier/__init__.py
+++ b/bedboss/bedclassifier/__init__.py
@@ -1 +1 @@
-from bedboss.bedclassifier.bedclassifier import BedClassifier
+from bedboss.bedclassifier.bedclassifier import BedClassifier, get_bed_type
diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
index fbf9781..75c0284 100644
--- a/bedboss/bedclassifier/bedclassifier.py
+++ b/bedboss/bedclassifier/bedclassifier.py
@@ -2,7 +2,7 @@
 import logging
 import os
 import shutil
-from typing import Optional
+from typing import Optional, Union
 
 import pypiper
 import pandas as pd
@@ -49,11 +49,17 @@ def __init__(
         # Want to use Pipeline Manager to log work AND cleanup unzipped gz files.
         if pm is not None:
             self.pm = pm
+            self.pm_created = False
         else:
             self.logs_dir = os.path.join(self.output_dir, "logs")
             self.pm = pypiper.PipelineManager(
-                name="bedclassifier", outfolder=self.logs_dir, recover=True
+                name="bedclassifier",
+                outfolder=self.logs_dir,
+                recover=True,
+                pipestat_sample_name=bed_digest,
             )
+            self.pm.start_pipeline()
+            self.pm_created = True
 
         if self.file_extension == ".gz":
             if ".bed" not in self.file_name:
@@ -64,24 +70,29 @@ def __init__(
                 unzipped_input_file = os.path.join(self.output_dir, self.file_name)
 
             with gzip.open(self.input_file, "rb") as f_in:
+                _LOGGER.info(
+                    f"Unzipping file:{self.input_file} and Creating Unzipped file: {unzipped_input_file}"
+                )
                 with open(unzipped_input_file, "wb") as f_out:
                     shutil.copyfileobj(f_in, f_out)
             self.input_file = unzipped_input_file
             self.pm.clean_add(unzipped_input_file)
 
-        bed_type = get_bed_type(self.input_file)
+        self.bed_type = get_bed_type(self.input_file)
 
         if self.input_type is not None:
-            if bed_type != self.input_type:
+            if self.bed_type != self.input_type:
                 _LOGGER.warning(
-                    f"BED file classified as different type than given input: {bed_type} vs {self.input_type}"
+                    f"BED file classified as different type than given input: {self.bed_type} vs {self.input_type}"
                 )
 
-        else:
-            self.input_file = bed_type
+        self.pm.report_result(key="bedtype", value=self.bed_type)
+
+        if self.pm_created is True:
+            self.pm.stop_pipeline()
 
 
-def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> str:
+def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> Union[str, None]:
     """
     get the bed file type (ex. bed3, bed3+n )
     standardize chromosomes if necessary:
@@ -106,8 +117,9 @@ def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> str:
     #    int[blockCount] blockSizes; "Comma separated list of block sizes"
     #    int[blockCount] chromStarts; "Start positions relative to chromStart"
 
-    # Use chunksize to read only a few lines of the BED file (We don't need all of it)
-    df = pd.read_csv(bed, sep="\t", header=None, chunksize=4)
+    # Use nrows to read only a few lines of the BED file (We don't need all of it)
+    df = pd.read_csv(bed, sep="\t", header=None, nrows=4)
+    print(df)
     df = df.dropna(axis=1)
 
     # standardizing chromosome
diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py
index 75aadc3..63ecb1e 100644
--- a/test/test_bedclassifier.py
+++ b/test/test_bedclassifier.py
@@ -1,14 +1,29 @@
 import os
 from tempfile import TemporaryDirectory
 
-from bedboss.bedclassifier import BedClassifier
+from bedboss.bedclassifier import BedClassifier, get_bed_type
 
 
 FILE_DIR = os.path.dirname(os.path.realpath(__file__))
 HG19_CORRECT_DIR = os.path.join(FILE_DIR, "test_data", "bed", "hg19", "correct")
 FILE_PATH = f"{HG19_CORRECT_DIR}/sample1.bed.gz"
+FILE_PATH_UNZIPPED = f"{HG19_CORRECT_DIR}/hg19_example1.bed"
 
 
 def test_classification():
     with TemporaryDirectory() as d:
         bedclass = BedClassifier(input_file=FILE_PATH, output_dir=d)
+        print("DEBUG BEDCLASS\n")
+        print(bedclass.bed_type)
+
+
+def test_get_bed_type():
+    bedtype = get_bed_type(bed=FILE_PATH_UNZIPPED)
+    print("DEBUG BEDTYPE\n")
+    print(bedtype)
+
+
+if __name__ == "__main__":
+    print("DEBUG FROM MAIN")
+    test_get_bed_type()
+    test_classification()

From 4ba8f752a01420876c49620b98f1df6fadda4835 Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Tue, 13 Feb 2024 10:30:02 -0500
Subject: [PATCH 04/13] Add error handling when reading csv, default to
 "unknown_bedtype" https://github.com/databio/bedboss/issues/34

---
 bedboss/bedclassifier/bedclassifier.py | 142 +++++++++++++------------
 test/test_bedclassifier.py             |  19 ++++
 2 files changed, 94 insertions(+), 67 deletions(-)

diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
index 75c0284..c9827a6 100644
--- a/bedboss/bedclassifier/bedclassifier.py
+++ b/bedboss/bedclassifier/bedclassifier.py
@@ -4,6 +4,7 @@
 import shutil
 from typing import Optional, Union
 
+import pandas.errors
 import pypiper
 import pandas as pd
 
@@ -62,12 +63,12 @@ def __init__(
             self.pm_created = True
 
         if self.file_extension == ".gz":
-            if ".bed" not in self.file_name:
-                unzipped_input_file = os.path.join(
-                    self.output_dir, self.file_name + ".bed"
-                )
-            else:
-                unzipped_input_file = os.path.join(self.output_dir, self.file_name)
+            # if ".bed" not in self.file_name:
+            #     unzipped_input_file = os.path.join(
+            #         self.output_dir, self.file_name + ".bed"
+            #     )
+            # else:
+            unzipped_input_file = os.path.join(self.output_dir, self.file_name)
 
             with gzip.open(self.input_file, "rb") as f_in:
                 _LOGGER.info(
@@ -118,70 +119,77 @@ def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> Union[str, N
     #    int[blockCount] chromStarts; "Start positions relative to chromStart"
 
     # Use nrows to read only a few lines of the BED file (We don't need all of it)
-    df = pd.read_csv(bed, sep="\t", header=None, nrows=4)
+    df = None
+    try:
+        df = pd.read_csv(bed, sep="\t", header=None, nrows=4)
+    except pandas.errors.ParserError as e:
+        _LOGGER.warning(f"Unable to parse bed file {bed}, setting bed_type = Unknown")
     print(df)
-    df = df.dropna(axis=1)
-
-    # standardizing chromosome
-    # remove regions on ChrUn chromosomes
-    if standard_chrom:
-        _LOGGER.info("Standardizing chromosomes...")
-        df = df[df.loc[:, 0].isin(STANDARD_CHROM_LIST)]
-        df.to_csv(bed, compression="gzip", sep="\t", header=False, index=False)
-
-    num_cols = len(df.columns)
-    bedtype = 0
-
-    # TODO add logic for narrow and broadpeak
-    for col in df:
-        if col <= 2:
-            if col == 0:
-                if df[col].dtype == "O":
-                    bedtype += 1
+    if df is not None:
+        df = df.dropna(axis=1)
+
+        # standardizing chromosome
+        # remove regions on ChrUn chromosomes
+        if standard_chrom:
+            _LOGGER.info("Standardizing chromosomes...")
+            df = df[df.loc[:, 0].isin(STANDARD_CHROM_LIST)]
+            df.to_csv(bed, compression="gzip", sep="\t", header=False, index=False)
+
+        num_cols = len(df.columns)
+        bedtype = 0
+
+        # TODO add logic for narrow and broadpeak
+        for col in df:
+            if col <= 2:
+                if col == 0:
+                    if df[col].dtype == "O":
+                        bedtype += 1
+                    else:
+                        return "unknown_bedtype"
                 else:
-                    return None
+                    if df[col].dtype == "int" and (df[col] >= 0).all():
+                        bedtype += 1
+                    else:
+                        return "unknown_bedtype"
             else:
-                if df[col].dtype == "int" and (df[col] >= 0).all():
-                    bedtype += 1
-                else:
-                    return None
-        else:
-            if col == 3:
-                if df[col].dtype == "O":
-                    bedtype += 1
-                else:
-                    n = num_cols - bedtype
-                    return f"bed{bedtype}+{n}"
-            elif col == 4:
-                if df[col].dtype == "int" and df[col].between(0, 1000).all():
-                    bedtype += 1
-                else:
-                    n = num_cols - bedtype
-                    return f"bed{bedtype}+{n}"
-            elif col == 5:
-                if df[col].isin(["+", "-", "."]).all():
-                    bedtype += 1
+                if col == 3:
+                    if df[col].dtype == "O":
+                        bedtype += 1
+                    else:
+                        n = num_cols - bedtype
+                        return f"bed{bedtype}+{n}"
+                elif col == 4:
+                    if df[col].dtype == "int" and df[col].between(0, 1000).all():
+                        bedtype += 1
+                    else:
+                        n = num_cols - bedtype
+                        return f"bed{bedtype}+{n}"
+                elif col == 5:
+                    if df[col].isin(["+", "-", "."]).all():
+                        bedtype += 1
+                    else:
+                        n = num_cols - bedtype
+                        return f"bed{bedtype}+{n}"
+                elif 6 <= col <= 8:
+                    if df[col].dtype == "int" and (df[col] >= 0).all():
+                        bedtype += 1
+                    else:
+                        n = num_cols - bedtype
+                        return f"bed{bedtype}+{n}"
+                elif col == 9:
+                    if df[col].dtype == "int":
+                        bedtype += 1
+                    else:
+                        n = num_cols - bedtype
+                        return f"bed{bedtype}+{n}"
+                elif col == 10 or col == 11:
+                    if df[col].str.match(r"^(\d+(,\d+)*)?$").all():
+                        bedtype += 1
+                    else:
+                        n = num_cols - bedtype
+                        return f"bed{bedtype}+{n}"
                 else:
                     n = num_cols - bedtype
                     return f"bed{bedtype}+{n}"
-            elif 6 <= col <= 8:
-                if df[col].dtype == "int" and (df[col] >= 0).all():
-                    bedtype += 1
-                else:
-                    n = num_cols - bedtype
-                    return f"bed{bedtype}+{n}"
-            elif col == 9:
-                if df[col].dtype == "int":
-                    bedtype += 1
-                else:
-                    n = num_cols - bedtype
-                    return f"bed{bedtype}+{n}"
-            elif col == 10 or col == 11:
-                if df[col].str.match(r"^(\d+(,\d+)*)?$").all():
-                    bedtype += 1
-                else:
-                    n = num_cols - bedtype
-                    return f"bed{bedtype}+{n}"
-            else:
-                n = num_cols - bedtype
-                return f"bed{bedtype}+{n}"
+    else:
+        return "unknown_bedtype"
diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py
index 63ecb1e..5d06fd8 100644
--- a/test/test_bedclassifier.py
+++ b/test/test_bedclassifier.py
@@ -23,7 +23,26 @@ def test_get_bed_type():
     print(bedtype)
 
 
+def test_manual_dir_beds():
+    """This test is currently just for local manual testing"""
+    local_dir = "/home/drc/Downloads/test_beds_BED_classifier/"
+    output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/"
+
+    for root, dirs, files in os.walk(local_dir):
+        for file in files:
+            print(file)
+            file_path = os.path.join(root, file)
+            print(file_path)
+            bedclass = BedClassifier(
+                input_file=file_path, output_dir=output_dir, bed_digest=file
+            )
+            print("\nDEBUG BEDCLASS\n")
+            print(bedclass.bed_type)
+            print("+++++++++++++++++++")
+
+
 if __name__ == "__main__":
     print("DEBUG FROM MAIN")
     test_get_bed_type()
     test_classification()
+    test_manual_dir_beds()

From 55d3b8867eae04c68ab79491960d7bd34b5c21df Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Tue, 13 Feb 2024 11:54:01 -0500
Subject: [PATCH 05/13] Add better exception handling and allow for
 integer/float chromosomes in column 0
 https://github.com/databio/bedboss/issues/34

---
 bedboss/bedclassifier/bedclassifier.py | 41 +++++++++++++++++++++++---
 bedboss/exceptions.py                  | 13 ++++++++
 test/test_bedclassifier.py             |  9 ++++--
 3 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
index c9827a6..2388238 100644
--- a/bedboss/bedclassifier/bedclassifier.py
+++ b/bedboss/bedclassifier/bedclassifier.py
@@ -9,6 +9,7 @@
 import pandas as pd
 
 from bedboss.const import STANDARD_CHROM_LIST
+from bedboss.exceptions import BedTypeException
 
 _LOGGER = logging.getLogger("bedboss")
 
@@ -93,7 +94,9 @@ def __init__(
             self.pm.stop_pipeline()
 
 
-def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> Union[str, None]:
+def get_bed_type(
+    bed: str, standard_chrom: Optional[str] = None, no_fail: Optional[bool] = True
+) -> Union[str, None]:
     """
     get the bed file type (ex. bed3, bed3+n )
     standardize chromosomes if necessary:
@@ -119,11 +122,22 @@ def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> Union[str, N
     #    int[blockCount] chromStarts; "Start positions relative to chromStart"
 
     # Use nrows to read only a few lines of the BED file (We don't need all of it)
+
     df = None
+
     try:
         df = pd.read_csv(bed, sep="\t", header=None, nrows=4)
     except pandas.errors.ParserError as e:
-        _LOGGER.warning(f"Unable to parse bed file {bed}, setting bed_type = Unknown")
+        if no_fail:
+            _LOGGER.warning(
+                f"Unable to parse bed file {bed}, setting bed_type = Unknown"
+            )
+            return "unknown_bedtype"
+        else:
+            raise BedTypeException(
+                reason=f"Bed type could not be determined due to CSV parse error {e}"
+            )
+
     print(df)
     if df is not None:
         df = df.dropna(axis=1)
@@ -144,13 +158,32 @@ def get_bed_type(bed: str, standard_chrom: Optional[str] = None) -> Union[str, N
                 if col == 0:
                     if df[col].dtype == "O":
                         bedtype += 1
+                    elif df[col].dtype == "int" or df[col].dtype == "float":
+                        bedtype += 1
                     else:
-                        return "unknown_bedtype"
+                        if no_fail:
+                            _LOGGER.warning(
+                                f"Bed type could not be determined at column 0 with data type: {df[col].dtype}"
+                            )
+                            return "unknown_bedtype"
+                        else:
+                            raise BedTypeException(
+                                reason=f"Bed type could not be determined at column {0} with data type: {df[col].dtype}"
+                            )
+
                 else:
                     if df[col].dtype == "int" and (df[col] >= 0).all():
                         bedtype += 1
                     else:
-                        return "unknown_bedtype"
+                        if no_fail:
+                            _LOGGER.warning(
+                                f"Bed type could not be determined at column {col} with data type: {df[col].dtype}"
+                            )
+                            return "unknown_bedtype"
+                        else:
+                            raise BedTypeException(
+                                reason=f"Bed type could not be determined at column 0 with data type: {df[col].dtype}"
+                            )
             else:
                 if col == 3:
                     if df[col].dtype == "O":
diff --git a/bedboss/exceptions.py b/bedboss/exceptions.py
index d84d06d..afd6f03 100644
--- a/bedboss/exceptions.py
+++ b/bedboss/exceptions.py
@@ -46,3 +46,16 @@ def __init__(self, reason: str = ""):
         :param str reason: additional info about requirements exception
         """
         super(RequirementsException, self).__init__(reason)
+
+
+class BedTypeException(BedBossException):
+    """Exception when Bed Type could not be determined."""
+
+    def __init__(self, reason: str = ""):
+        """
+        Optionally provide explanation for exceptional condition.
+
+        :param str reason: some context why the bed type could not
+        be determined
+        """
+        super(BedTypeException, self).__init__(reason)
diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py
index 5d06fd8..0125284 100644
--- a/test/test_bedclassifier.py
+++ b/test/test_bedclassifier.py
@@ -26,6 +26,7 @@ def test_get_bed_type():
 def test_manual_dir_beds():
     """This test is currently just for local manual testing"""
     local_dir = "/home/drc/Downloads/test_beds_BED_classifier/"
+    # local_dir = "/home/drc/Downloads/individual_beds/"
     output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/"
 
     for root, dirs, files in os.walk(local_dir):
@@ -41,8 +42,12 @@ def test_manual_dir_beds():
             print("+++++++++++++++++++")
 
 
+def test_from_PEPhub_beds():
+    pass
+
+
 if __name__ == "__main__":
     print("DEBUG FROM MAIN")
-    test_get_bed_type()
-    test_classification()
+    # test_get_bed_type()
+    # test_classification()
     test_manual_dir_beds()

From 3d3ef5da91451afc2792a954653ec14a045a19ac Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Tue, 13 Feb 2024 12:06:47 -0500
Subject: [PATCH 06/13] Fix returns, and grouped exceptions

---
 bedboss/bedclassifier/bedclassifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
index 2388238..d1518b4 100644
--- a/bedboss/bedclassifier/bedclassifier.py
+++ b/bedboss/bedclassifier/bedclassifier.py
@@ -96,7 +96,7 @@ def __init__(
 
 def get_bed_type(
     bed: str, standard_chrom: Optional[str] = None, no_fail: Optional[bool] = True
-) -> Union[str, None]:
+) -> str:
     """
     get the bed file type (ex. bed3, bed3+n )
     standardize chromosomes if necessary:
@@ -127,7 +127,7 @@ def get_bed_type(
 
     try:
         df = pd.read_csv(bed, sep="\t", header=None, nrows=4)
-    except pandas.errors.ParserError as e:
+    except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
         if no_fail:
             _LOGGER.warning(
                 f"Unable to parse bed file {bed}, setting bed_type = Unknown"

From 12b88764519afb80c6fd032758bc236da034916d Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Tue, 13 Feb 2024 12:54:01 -0500
Subject: [PATCH 07/13] add clarity to errors

---
 bedboss/bedclassifier/bedclassifier.py | 4 ++--
 test/test_bedclassifier.py             | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
index d1518b4..f6a0e5c 100644
--- a/bedboss/bedclassifier/bedclassifier.py
+++ b/bedboss/bedclassifier/bedclassifier.py
@@ -130,7 +130,7 @@ def get_bed_type(
     except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
         if no_fail:
             _LOGGER.warning(
-                f"Unable to parse bed file {bed}, setting bed_type = Unknown"
+                f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = Unknown"
             )
             return "unknown_bedtype"
         else:
@@ -163,7 +163,7 @@ def get_bed_type(
                     else:
                         if no_fail:
                             _LOGGER.warning(
-                                f"Bed type could not be determined at column 0 with data type: {df[col].dtype}"
+                                f"Bed type could not be determined at column {0} with data type: {df[col].dtype}"
                             )
                             return "unknown_bedtype"
                         else:
diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py
index 0125284..53d78b9 100644
--- a/test/test_bedclassifier.py
+++ b/test/test_bedclassifier.py
@@ -25,8 +25,8 @@ def test_get_bed_type():
 
 def test_manual_dir_beds():
     """This test is currently just for local manual testing"""
-    local_dir = "/home/drc/Downloads/test_beds_BED_classifier/"
-    # local_dir = "/home/drc/Downloads/individual_beds/"
+    # local_dir = "/home/drc/Downloads/test_beds_BED_classifier/"
+    local_dir = "/home/drc/Downloads/individual_beds/"
     output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/"
 
     for root, dirs, files in os.walk(local_dir):

From 558b1f5e8589e30d6cbdb8c59c595ab3b77154fc Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Tue, 13 Feb 2024 15:52:28 -0500
Subject: [PATCH 08/13] skip first rows of bed file if they are not in column
 format

---
 bedboss/bedclassifier/bedclassifier.py | 33 ++++++++++++++++----------
 test/test_bedclassifier.py             |  3 ++-
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
index f6a0e5c..b5f1570 100644
--- a/bedboss/bedclassifier/bedclassifier.py
+++ b/bedboss/bedclassifier/bedclassifier.py
@@ -125,18 +125,26 @@ def get_bed_type(
 
     df = None
 
-    try:
-        df = pd.read_csv(bed, sep="\t", header=None, nrows=4)
-    except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
-        if no_fail:
-            _LOGGER.warning(
-                f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = Unknown"
-            )
-            return "unknown_bedtype"
-        else:
-            raise BedTypeException(
-                reason=f"Bed type could not be determined due to CSV parse error {e}"
-            )
+    max_rows = 5
+    row_count = 0
+    while row_count <= max_rows:
+        print(f"ROW COUNT: {row_count}")
+        try:
+            df = pd.read_csv(bed, sep="\t", header=None, nrows=4, skiprows=row_count)
+            break
+        except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
+            if row_count <= max_rows:
+                row_count += 1
+            else:
+                if no_fail:
+                    _LOGGER.warning(
+                        f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype"
+                    )
+                    return "unknown_bedtype"
+                else:
+                    raise BedTypeException(
+                        reason=f"Bed type could not be determined due to CSV parse error {e}"
+                    )
 
     print(df)
     if df is not None:
@@ -152,7 +160,6 @@ def get_bed_type(
         num_cols = len(df.columns)
         bedtype = 0
 
-        # TODO add logic for narrow and broadpeak
         for col in df:
             if col <= 2:
                 if col == 0:
diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py
index 53d78b9..41b377a 100644
--- a/test/test_bedclassifier.py
+++ b/test/test_bedclassifier.py
@@ -26,7 +26,8 @@ def test_get_bed_type():
 def test_manual_dir_beds():
     """This test is currently just for local manual testing"""
     # local_dir = "/home/drc/Downloads/test_beds_BED_classifier/"
-    local_dir = "/home/drc/Downloads/individual_beds/"
+    # local_dir = "/home/drc/Downloads/individual_beds/"
+    local_dir = "/home/drc/Downloads/only_narrowpeaks/"
     output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/"
 
     for root, dirs, files in os.walk(local_dir):

From efaf08333c7657513d9e7595628e1f35a5bad5bb Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Tue, 13 Feb 2024 16:13:11 -0500
Subject: [PATCH 09/13] add simple narrowPeak and broadPeak logic for
 classification

---
 bedboss/bedclassifier/bedclassifier.py | 14 +++++++++-----
 test/test_bedclassifier.py             |  2 ++
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
index b5f1570..d62faea 100644
--- a/bedboss/bedclassifier/bedclassifier.py
+++ b/bedboss/bedclassifier/bedclassifier.py
@@ -104,6 +104,7 @@ def get_bed_type(
     remove regions on ChrUn chromosomes
 
     :param bed: path to the bed file
+    :param no_fail: should the function (and pipeline) continue if this function fails to parse BED file
     :param standard_chrom:
     :return bed type
     """
@@ -121,8 +122,6 @@ def get_bed_type(
     #    int[blockCount] blockSizes; "Comma separated list of block sizes"
     #    int[blockCount] chromStarts; "Start positions relative to chromStart"
 
-    # Use nrows to read only a few lines of the BED file (We don't need all of it)
-
     df = None
 
     max_rows = 5
@@ -146,7 +145,6 @@ def get_bed_type(
                         reason=f"Bed type could not be determined due to CSV parse error {e}"
                     )
 
-    print(df)
     if df is not None:
         df = df.dropna(axis=1)
 
@@ -221,13 +219,19 @@ def get_bed_type(
                         bedtype += 1
                     else:
                         n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
+                        if "broadpeak" in bed or "broadPeak" in bed:
+                            return f"broadPeak,bed{bedtype}+{n}"
+                        else:
+                            return f"bed{bedtype}+{n}"
                 elif col == 10 or col == 11:
                     if df[col].str.match(r"^(\d+(,\d+)*)?$").all():
                         bedtype += 1
                     else:
                         n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
+                        if "narrowpeak" in bed or "narrowPeak" in bed:
+                            return f"narrowPeak,bed{bedtype}+{n}"
+                        else:
+                            return f"bed{bedtype}+{n}"
                 else:
                     n = num_cols - bedtype
                     return f"bed{bedtype}+{n}"
diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py
index 41b377a..2d1db18 100644
--- a/test/test_bedclassifier.py
+++ b/test/test_bedclassifier.py
@@ -44,6 +44,8 @@ def test_manual_dir_beds():
 
 
 def test_from_PEPhub_beds():
+    """"""
+    # TODO implement testing from pephub
     pass
 
 

From 09a6405812287e911320efdfa529cb4a867f8e93 Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Tue, 13 Feb 2024 16:27:30 -0500
Subject: [PATCH 10/13] remove unused code

---
 bedboss/bedclassifier/bedclassifier.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
index d62faea..f08189f 100644
--- a/bedboss/bedclassifier/bedclassifier.py
+++ b/bedboss/bedclassifier/bedclassifier.py
@@ -64,11 +64,6 @@ def __init__(
             self.pm_created = True
 
         if self.file_extension == ".gz":
-            # if ".bed" not in self.file_name:
-            #     unzipped_input_file = os.path.join(
-            #         self.output_dir, self.file_name + ".bed"
-            #     )
-            # else:
             unzipped_input_file = os.path.join(self.output_dir, self.file_name)
 
             with gzip.open(self.input_file, "rb") as f_in:

From f5333a34bfada63b4623cb8832985bc821579e7e Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Tue, 13 Feb 2024 16:29:07 -0500
Subject: [PATCH 11/13] comment out manual test

---
 test/test_bedclassifier.py | 42 +++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py
index 2d1db18..c5fde95 100644
--- a/test/test_bedclassifier.py
+++ b/test/test_bedclassifier.py
@@ -23,24 +23,24 @@ def test_get_bed_type():
     print(bedtype)
 
 
-def test_manual_dir_beds():
-    """This test is currently just for local manual testing"""
-    # local_dir = "/home/drc/Downloads/test_beds_BED_classifier/"
-    # local_dir = "/home/drc/Downloads/individual_beds/"
-    local_dir = "/home/drc/Downloads/only_narrowpeaks/"
-    output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/"
-
-    for root, dirs, files in os.walk(local_dir):
-        for file in files:
-            print(file)
-            file_path = os.path.join(root, file)
-            print(file_path)
-            bedclass = BedClassifier(
-                input_file=file_path, output_dir=output_dir, bed_digest=file
-            )
-            print("\nDEBUG BEDCLASS\n")
-            print(bedclass.bed_type)
-            print("+++++++++++++++++++")
+# def test_manual_dir_beds():
+#     """This test is currently just for local manual testing"""
+#     # local_dir = "/home/drc/Downloads/test_beds_BED_classifier/"
+#     # local_dir = "/home/drc/Downloads/individual_beds/"
+#     local_dir = "/home/drc/Downloads/only_narrowpeaks/"
+#     output_dir = "/home/drc/Downloads/BED_CLASSIFIER_OUTPUT/"
+#
+#     for root, dirs, files in os.walk(local_dir):
+#         for file in files:
+#             print(file)
+#             file_path = os.path.join(root, file)
+#             print(file_path)
+#             bedclass = BedClassifier(
+#                 input_file=file_path, output_dir=output_dir, bed_digest=file
+#             )
+#             print("\nDEBUG BEDCLASS\n")
+#             print(bedclass.bed_type)
+#             print("+++++++++++++++++++")
 
 
 def test_from_PEPhub_beds():
@@ -51,6 +51,6 @@ def test_from_PEPhub_beds():
 
 if __name__ == "__main__":
     print("DEBUG FROM MAIN")
-    # test_get_bed_type()
-    # test_classification()
-    test_manual_dir_beds()
+    test_get_bed_type()
+    test_classification()
+    # test_manual_dir_beds()

From e968fad5abbb94c2612e8a70894cd3314a3a4aa5 Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Wed, 14 Feb 2024 10:07:00 -0500
Subject: [PATCH 12/13] comment out main call for manual test, add pytest
 skipping for tests

---
 test/test_bedclassifier.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py
index c5fde95..1c22fc8 100644
--- a/test/test_bedclassifier.py
+++ b/test/test_bedclassifier.py
@@ -1,4 +1,5 @@
 import os
+import pytest
 from tempfile import TemporaryDirectory
 
 from bedboss.bedclassifier import BedClassifier, get_bed_type
@@ -10,17 +11,22 @@
 FILE_PATH_UNZIPPED = f"{HG19_CORRECT_DIR}/hg19_example1.bed"
 
 
+@pytest.mark.skip(reason="Illegal seek during teardown.")
 def test_classification():
     with TemporaryDirectory() as d:
         bedclass = BedClassifier(input_file=FILE_PATH, output_dir=d)
-        print("DEBUG BEDCLASS\n")
-        print(bedclass.bed_type)
 
 
 def test_get_bed_type():
     bedtype = get_bed_type(bed=FILE_PATH_UNZIPPED)
-    print("DEBUG BEDTYPE\n")
-    print(bedtype)
+    assert bedtype == "bed6+3"
+
+
+@pytest.mark.skip(reason="Not implemented")
+def test_from_PEPhub_beds():
+    """"""
+    # TODO implement testing from pephub
+    pass
 
 
 # def test_manual_dir_beds():
@@ -43,14 +49,7 @@ def test_get_bed_type():
 #             print("+++++++++++++++++++")
 
 
-def test_from_PEPhub_beds():
-    """"""
-    # TODO implement testing from pephub
-    pass
-
-
-if __name__ == "__main__":
-    print("DEBUG FROM MAIN")
-    test_get_bed_type()
-    test_classification()
-    # test_manual_dir_beds()
+# if __name__ == "__main__":
+#     test_get_bed_type()
+#     test_classification()
+# test_manual_dir_beds()

From 5db459768766ceb7aaa14b3d987eb65401b8605a Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Wed, 14 Feb 2024 12:44:59 -0500
Subject: [PATCH 13/13] add returning tuple when classifying, e.g.
 (f"bed{bedtype}+{n}", "broadpeak")

---
 bedboss/bedclassifier/bedclassifier.py | 32 +++++++++++++-------------
 test/test_bedclassifier.py             |  2 +-
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
index f08189f..4251b05 100644
--- a/bedboss/bedclassifier/bedclassifier.py
+++ b/bedboss/bedclassifier/bedclassifier.py
@@ -2,7 +2,7 @@
 import logging
 import os
 import shutil
-from typing import Optional, Union
+from typing import Optional, Tuple
 
 import pandas.errors
 import pypiper
@@ -91,7 +91,7 @@ def __init__(
 
 def get_bed_type(
     bed: str, standard_chrom: Optional[str] = None, no_fail: Optional[bool] = True
-) -> str:
+) -> Tuple[str, str]:
     """
     get the bed file type (ex. bed3, bed3+n )
     standardize chromosomes if necessary:
@@ -101,7 +101,7 @@ def get_bed_type(
     :param bed: path to the bed file
     :param no_fail: should the function (and pipeline) continue if this function fails to parse BED file
     :param standard_chrom:
-    :return bed type
+    :return: tuple of (column classification: "bed{bedtype}+{n}" or "unknown_bedtype", file format: one of "bed", "narrowpeak", "broadpeak", "unknown_bedtype")
     """
     #    column format for bed12
     #    string chrom;       "Reference sequence chromosome or scaffold"
@@ -134,7 +134,7 @@ def get_bed_type(
                     _LOGGER.warning(
                         f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype"
                     )
-                    return "unknown_bedtype"
+                    return ("unknown_bedtype", "unknown_bedtype")
                 else:
                     raise BedTypeException(
                         reason=f"Bed type could not be determined due to CSV parse error {e}"
@@ -165,7 +165,7 @@ def get_bed_type(
                             _LOGGER.warning(
                                 f"Bed type could not be determined at column {0} with data type: {df[col].dtype}"
                             )
-                            return "unknown_bedtype"
+                            return ("unknown_bedtype", "unknown_bedtype")
                         else:
                             raise BedTypeException(
                                 reason=f"Bed type could not be determined at column {0} with data type: {df[col].dtype}"
@@ -179,7 +179,7 @@ def get_bed_type(
                             _LOGGER.warning(
                                 f"Bed type could not be determined at column {col} with data type: {df[col].dtype}"
                             )
-                            return "unknown_bedtype"
+                            return ("unknown_bedtype", "unknown_bedtype")
                         else:
                             raise BedTypeException(
                                 reason=f"Bed type could not be determined at column 0 with data type: {df[col].dtype}"
@@ -190,45 +190,45 @@ def get_bed_type(
                         bedtype += 1
                     else:
                         n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
+                        return (f"bed{bedtype}+{n}", "bed")
                 elif col == 4:
                     if df[col].dtype == "int" and df[col].between(0, 1000).all():
                         bedtype += 1
                     else:
                         n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
+                        return (f"bed{bedtype}+{n}", "bed")
                 elif col == 5:
                     if df[col].isin(["+", "-", "."]).all():
                         bedtype += 1
                     else:
                         n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
+                        return (f"bed{bedtype}+{n}", "bed")
                 elif 6 <= col <= 8:
                     if df[col].dtype == "int" and (df[col] >= 0).all():
                         bedtype += 1
                     else:
                         n = num_cols - bedtype
-                        return f"bed{bedtype}+{n}"
+                        return (f"bed{bedtype}+{n}", "bed")
                 elif col == 9:
                     if df[col].dtype == "int":
                         bedtype += 1
                     else:
                         n = num_cols - bedtype
                         if "broadpeak" in bed or "broadPeak" in bed:
-                            return f"broadPeak,bed{bedtype}+{n}"
+                            return (f"bed{bedtype}+{n}", "broadpeak")
                         else:
-                            return f"bed{bedtype}+{n}"
+                            return (f"bed{bedtype}+{n}", "bed")
                 elif col == 10 or col == 11:
                     if df[col].str.match(r"^(\d+(,\d+)*)?$").all():
                         bedtype += 1
                     else:
                         n = num_cols - bedtype
                         if "narrowpeak" in bed or "narrowPeak" in bed:
-                            return f"narrowPeak,bed{bedtype}+{n}"
+                            return (f"bed{bedtype}+{n}", "narrowpeak")
                         else:
-                            return f"bed{bedtype}+{n}"
+                            return (f"bed{bedtype}+{n}", "bed")
                 else:
                     n = num_cols - bedtype
-                    return f"bed{bedtype}+{n}"
+                    return (f"bed{bedtype}+{n}", "bed")
     else:
-        return "unknown_bedtype"
+        return ("unknown_bedtype", "unknown_bedtype")
diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py
index 1c22fc8..aac980e 100644
--- a/test/test_bedclassifier.py
+++ b/test/test_bedclassifier.py
@@ -19,7 +19,7 @@ def test_classification():
 
 def test_get_bed_type():
     bedtype = get_bed_type(bed=FILE_PATH_UNZIPPED)
-    assert bedtype == "bed6+3"
+    assert bedtype == ("bed6+3", "bed")
 
 
 @pytest.mark.skip(reason="Not implemented")