Merge remote-tracking branch 'origin/dev' into dev

databio · Jun 3, 2024 · 040ddac · 040ddac
2 parents 07b1ba9 + 7cb35e1
commit 040ddac
Show file tree

Hide file tree

Showing 7 changed files with 252 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -139,4 +139,15 @@ openSignalMatrix
 out2023/*
 
 # test data
-test/test_data/*
+test/test_data/*
+/scripts/bedclassifier_tuning/results/
+/scripts/bedclassifier_tuning/data/
+genome_config.yaml
+alias/hg19/fasta/default/hg19.chrom.sizes
+alias/hg19/fasta/default/hg19.fa
+alias/hg19/fasta/default/hg19.fa.fai
+data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c__ASDs.json
+data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.chrom.sizes
+data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa
+data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa.fai
+test/Untitled.ipynb
diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
@@ -38,12 +38,39 @@ def get_bed_type(bed: str, no_fail: Optional[bool] = True) -> Tuple[str, str]:
 
     max_rows = 5
     row_count = 0
+
     while row_count <= max_rows:
         try:
             df = pd.read_csv(bed, sep="\t", header=None, nrows=4, skiprows=row_count)
             if row_count > 0:
                 _LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
             break
+        except UnicodeDecodeError as e:
+            try:
+                df = pd.read_csv(
+                    bed,
+                    sep="\t",
+                    header=None,
+                    nrows=4,
+                    skiprows=row_count,
+                    encoding="utf-16",
+                )
+                if row_count > 0:
+                    _LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
+                break
+            except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
+                if row_count <= max_rows:
+                    row_count += 1
+                else:
+                    if no_fail:
+                        _LOGGER.warning(
+                            f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype"
+                        )
+                        return "unknown_bedtype", "unknown_bedtype"
+                    else:
+                        raise BedTypeException(
+                            reason=f"Bed type could not be determined due to CSV parse error {e}"
+                        )
         except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
             if row_count <= max_rows:
                 row_count += 1

diff --git a/bedboss/cli.py b/bedboss/cli.py
@@ -391,7 +391,6 @@ def make_bedset(
 def init_config(
     outfolder: str = typer.Option(..., help="Path to the output folder"),
 ):
-
     from bedboss.utils import save_example_bedbase_config
 
     save_example_bedbase_config(outfolder)

diff --git a/scripts/bedclassifier_tuning/README.md b/scripts/bedclassifier_tuning/README.md
@@ -0,0 +1,2 @@
+Just making a script to download and assess BED files and determine if they are correctly classified.
+
diff --git a/scripts/bedclassifier_tuning/bedclassifier_output_schema.yaml b/scripts/bedclassifier_tuning/bedclassifier_output_schema.yaml
@@ -0,0 +1,23 @@
+title: Bed Classifier
+description: Output for bed classification results
+type: object
+properties:
+  pipeline_name: "bedclassifier"
+  samples:
+    type: object
+    properties:
+      bedfile_named:
+        type: string
+        description: "reported bedfile name e.g. narrowpeak"
+      bedfile_type:
+        type: string
+        description: "reported bedfile type"
+      given_bedfile_type:
+        type: string
+        description: "given bed file type"
+      types_match:
+        type: boolean
+        description: "Do the types match?"
+      gsm:
+        type: string
+        description: "given gsm"
diff --git a/scripts/bedclassifier_tuning/bedclassify.py b/scripts/bedclassifier_tuning/bedclassify.py
@@ -0,0 +1,188 @@
+import gzip
+import logging
+import os
+import shutil
+
+import pipestat
+import pypiper
+from typing import Optional
+
+from bedboss.bedclassifier import get_bed_type
+from bedboss.exceptions import BedTypeException
+
+_LOGGER = logging.getLogger("bedboss")
+
+from geofetch import Finder, Geofetcher
+
+
+class BedClassifier:
+    """
+    This will take the input of either a .bed or a .bed.gz and classify the type of BED file.
+
+    """
+
+    def __init__(
+        self,
+        input_file: str,
+        output_dir: Optional[str] = None,
+        bed_digest: Optional[str] = None,
+        input_type: Optional[str] = None,
+        pm: pypiper.PipelineManager = None,
+        report_to_database: Optional[bool] = False,
+        psm: pipestat.PipestatManager = None,
+        gsm: str = None,
+    ):
+        # Raise Exception if input_type is given and it is NOT a BED file
+        # Raise Exception if the input file cannot be resolved
+
+        self.gsm = gsm
+        self.input_file = input_file
+        self.bed_digest = bed_digest
+        self.input_type = input_type
+
+        self.abs_bed_path = os.path.abspath(self.input_file)
+        self.file_name = os.path.splitext(os.path.basename(self.abs_bed_path))[0]
+        self.file_extension = os.path.splitext(self.abs_bed_path)[-1]
+
+        # we need this only if unzipping a file
+        self.output_dir = output_dir or os.path.join(
+            os.path.dirname(self.abs_bed_path), "temp_processing"
+        )
+        # Use existing Pipeline Manager if it exists
+        self.pm = pm
+
+        if psm is None:
+            pephuburl = "donaldcampbelljr/bedclassifier_tuning_geo:default"
+            self.psm = pipestat.PipestatManager(
+                pephub_path=pephuburl, schema_path="bedclassifier_output_schema.yaml"
+            )
+        else:
+            self.psm = psm
+
+        if self.file_extension == ".gz":
+            unzipped_input_file = os.path.join(self.output_dir, self.file_name)
+
+            with gzip.open(self.input_file, "rb") as f_in:
+                _LOGGER.info(
+                    f"Unzipping file:{self.input_file} and Creating Unzipped file: {unzipped_input_file}"
+                )
+                with open(unzipped_input_file, "wb") as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+            self.input_file = unzipped_input_file
+            if self.pm:
+                self.pm.clean_add(unzipped_input_file)
+
+        try:
+            self.bed_type, self.bed_type_named = get_bed_type(self.input_file)
+        except BedTypeException as e:
+            _LOGGER.warning(msg=f"FAILED {bed_digest}  Exception {e}")
+            self.bed_type = "unknown_bedtype"
+            self.bed_type_named = "unknown_bedtype"
+
+        if self.input_type is not None:
+            if self.bed_type_named != self.input_type:
+                _LOGGER.warning(
+                    f"BED file classified as different type than given input: {self.bed_type} vs {self.input_type}"
+                )
+                do_types_match = False
+            else:
+                do_types_match = True
+        else:
+            do_types_match = False
+
+        # Create Value Dict to report via pipestat
+
+        all_values = {}
+
+        if self.input_type:
+            all_values.update({"given_bedfile_type": self.input_type})
+        if self.bed_type:
+            all_values.update({"bedfile_type": self.bed_type})
+        if self.bed_type_named:
+            all_values.update({"bedfile_named": self.bed_type_named})
+        if self.gsm:
+            all_values.update({"gsm": self.gsm})
+
+        all_values.update({"types_match": do_types_match})
+
+        try:
+            psm.report(record_identifier=bed_digest, values=all_values)
+        except Exception as e:
+            _LOGGER.warning(msg=f"FAILED {bed_digest}  Exception {e}")
+
+        if self.pm:
+            self.pm.stop_pipeline()
+
+
+def main():
+    # PEP for reporting all classification results
+    pephuburl = "donaldcampbelljr/bedclassifier_tuning_geo:default"
+
+    # Place these external to pycharm folder!!!
+    data_output_path = os.path.abspath("data")
+    results_path = os.path.abspath("results")
+    logs_dir = os.path.join(results_path, "logs")
+
+    gse_obj = Finder()
+
+    # # Optionally: provide filter string and max number of retrieve elements
+    # gse_obj = Finder(filters="narrowpeak", retmax=100)
+    #
+    # gse_list = gse_obj.get_gse_all()
+    # gse_obj.generate_file("data/output.txt", gse_list=gse_list)
+
+    pm = pypiper.PipelineManager(
+        name="bedclassifier",
+        outfolder=logs_dir,
+        recover=True,
+    )
+
+    pm.start_pipeline()
+
+    # for geo in gse_list:
+    geofetcher_obj = Geofetcher(
+        filter="\.(bed|narrowPeak|broadPeak)\.",
+        filter_size="25MB",
+        data_source="samples",
+        geo_folder=data_output_path,
+        metadata_folder=data_output_path,
+        processed=True,
+        max_soft_size="20MB",
+        discard_soft=True,
+    )
+
+    # geofetcher_obj.fetch_all(input="data/output.txt", name="donald_test")
+    geofetched = geofetcher_obj.get_projects(
+        input=os.path.join(data_output_path, "output.txt"), just_metadata=False
+    )
+
+    samples = geofetched["output_samples"].samples
+
+    psm = pipestat.PipestatManager(
+        pephub_path=pephuburl, schema_path="bedclassifier_output_schema.yaml"
+    )
+
+    for sample in samples:
+        if isinstance(sample.output_file_path, list):
+            bedfile = sample.output_file_path[0]
+        else:
+            bedfile = sample.output_file_path
+        geo_accession = sample.sample_geo_accession
+        sample_name = sample.sample_name
+        bed_type_from_geo = sample.type.lower()
+
+        bed = BedClassifier(
+            input_file=bedfile,
+            bed_digest=sample_name,  # TODO FIX THIS IT HOULD BE AN ACTUAL DIGEST
+            output_dir=results_path,
+            input_type=bed_type_from_geo,
+            psm=psm,
+            pm=pm,
+            gsm=geo_accession,
+        )
+
+    pm.stop_pipeline()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py
@@ -16,7 +16,6 @@
 
 
 class TestBedClassifier:
-
     def test_classification(
         self,
     ):
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Just making a script to download and assess BED files and determine if they are correctly classified.
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,7 +16,6 @@ @@
     class TestBedClassifier:
         def test_classification(
             self,
         ):
@@ Expand Down @@