Skip to content

Commit

Permalink
Merge pull request #44 from databio/restructuring
Browse files Browse the repository at this point in the history
Restructuring
  • Loading branch information
khoroshevskyi authored Feb 26, 2024
2 parents 3dbd1db + 96ca0a8 commit 884d99b
Show file tree
Hide file tree
Showing 11 changed files with 227 additions and 133 deletions.
101 changes: 63 additions & 38 deletions bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import peppy
from eido import validate_project
import bbconf
import subprocess

import pephubclient
from pephubclient import PEPHubClient
Expand All @@ -28,7 +29,7 @@
BEDSTAT_OUTPUT,
BED_PEP_REGISTRY,
)
from bedboss.models import BedMetadata
from bedboss.models import BedMetadata, BedStatCLIModel, BedMakerCLIModel, BedQCCLIModel
from bedboss.utils import (
extract_file_name,
standardize_genome_name,
Expand Down Expand Up @@ -113,6 +114,18 @@ def load_to_s3(
pm.run(cmd=command, lock_name="s3_sync_bedstat")


def requirements_check() -> None:
    """
    Check if all external (non-Python) requirements for bedboss are installed.

    Delegates to the bundled ``requirements_test.sh`` shell script, which
    prints its findings to stdout/stderr. A non-zero exit status from the
    script is logged as a warning rather than raised, so the CLI command
    remains informational and never aborts.

    :return: None
    """
    _LOGGER.info("Checking requirements...")
    # Resolve the script relative to this module, not the caller's
    # working directory, so the check works from anywhere.
    script_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "requirements_test.sh"
    )
    result = subprocess.run(["bash", script_path])
    # The original discarded the exit status entirely; surface failures
    # explicitly (without raising, to stay backward-compatible).
    if result.returncode != 0:
        _LOGGER.warning(
            f"Requirements check script exited with code {result.returncode}"
        )


def run_all(
sample_name: str,
input_file: str,
Expand Down Expand Up @@ -301,7 +314,6 @@ def insert_pep(
pep: Union[str, peppy.Project],
rfg_config: str = None,
create_bedset: bool = True,
skip_qdrant: bool = True,
check_qc: bool = True,
standardize: bool = False,
ensdb: str = None,
Expand All @@ -310,6 +322,7 @@ def insert_pep(
force_overwrite: bool = False,
upload_s3: bool = False,
upload_pephub: bool = False,
upload_qdrant: bool = False,
pm: pypiper.PipelineManager = None,
*args,
**kwargs,
Expand All @@ -323,19 +336,22 @@ def insert_pep(
:param Union[str, peppy.Project] pep: path to the pep file or pephub registry path
:param str rfg_config: path to the genome config file (refgenie)
:param bool create_bedset: whether to create bedset
:param bool skip_qdrant: whether to skip qdrant indexing
:param bool upload_qdrant: whether to upload bedfiles to qdrant
:param bool check_qc: whether to run quality control during badmaking
:param bool standardize: "Standardize bed files: remove non-standard chromosomes and headers if necessary Default: False"
:param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata
:param bool just_db_commit: whether just to commit the JSON to the database
:param bool no_db_commit: whether the JSON commit to the database should be skipped
:param bool just_db_commit: whether save only to the database (Without saving locally )
:param bool db_commit: whether to upload data to the database
:param bool force_overwrite: whether to overwrite the existing record
:param bool upload_s3: whether to upload to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param bool upload_qdrant: whether to execute qdrant indexing
:param pypiper.PipelineManager pm: pypiper object
:return: None
"""

_LOGGER.warning(f"!Unused arguments: {kwargs}")
failed_samples = []
pephub_registry_path = None
if isinstance(pep, peppy.Project):
pass
Expand All @@ -354,36 +370,41 @@ def insert_pep(

for i, pep_sample in enumerate(pep.samples):
_LOGGER.info(f"Running bedboss pipeline for {pep_sample.sample_name}")

if pep_sample.get("file_type").lower() == "narrowpeak":
is_narrow_peak = True
if pep_sample.get("file_type"):
if pep_sample.get("file_type").lower() == "narrowpeak":
is_narrow_peak = True
else:
is_narrow_peak = False
else:
is_narrow_peak = False

bed_id = run_all(
sample_name=pep_sample.sample_name,
input_file=pep_sample.input_file,
input_type=pep_sample.input_type,
genome=pep_sample.genome,
narrowpeak=is_narrow_peak,
chrom_sizes=pep_sample.get("chrom_sizes"),
open_signal_matrix=pep_sample.get("open_signal_matrix"),
other_metadata=pep_sample.to_dict(),
outfolder=output_folder,
bedbase_config=bbc,
rfg_config=rfg_config,
check_qc=check_qc,
standardize=standardize,
ensdb=ensdb,
just_db_commit=just_db_commit,
no_db_commit=no_db_commit,
force_overwrite=force_overwrite,
skip_qdrant=skip_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
pm=pm,
)
pep.samples[i].record_identifier = bed_id
try:
bed_id = run_all(
sample_name=pep_sample.sample_name,
input_file=pep_sample.input_file,
input_type=pep_sample.input_type,
genome=pep_sample.genome,
narrowpeak=is_narrow_peak,
chrom_sizes=pep_sample.get("chrom_sizes"),
open_signal_matrix=pep_sample.get("open_signal_matrix"),
other_metadata=pep_sample.to_dict(),
outfolder=output_folder,
bedbase_config=bbc,
rfg_config=rfg_config,
check_qc=check_qc,
standardize=standardize,
ensdb=ensdb,
just_db_commit=just_db_commit,
no_db_commit=no_db_commit,
force_overwrite=force_overwrite,
upload_qdrant=upload_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
pm=pm,
)
pep.samples[i].record_identifier = bed_id
except BedBossException as e:
_LOGGER.error(f"Failed to process {pep_sample.sample_name}. See {e}")
failed_samples.append(pep_sample.sample_name)

else:
_LOGGER.info("Skipping uploading to s3. Flag `upload_s3` is set to False")
Expand All @@ -394,11 +415,13 @@ def insert_pep(
bedbase_config=bbc,
bedset_pep=pep,
pephub_registry_path=pephub_registry_path,
upload_pephub=upload_pephub,
)
else:
_LOGGER.info(
f"Skipping bedset creation. Create_bedset is set to {create_bedset}"
)
_LOGGER.info(f"Failed samples: {failed_samples}")


def main(test_args: dict = None) -> NoReturn:
Expand All @@ -423,28 +446,30 @@ def main(test_args: dict = None) -> NoReturn:
or "test_outfolder",
)
pm_out_folder = os.path.join(os.path.abspath(pm_out_folder[0]), "pipeline_manager")

pm = pypiper.PipelineManager(
name="bedboss-pipeline",
outfolder=pm_out_folder,
version=__version__,
args=args,
# args=args,
multi=args_dict.get("multy", False),
recover=True,
)
if args_dict["command"] == "all":
run_all(pm=pm, **args_dict)
elif args_dict["command"] == "insert":
insert_pep(pm=pm, **args_dict)
elif args_dict["command"] == "make":
make_all(pm=pm, **args_dict)
make_all(**BedMakerCLIModel(pm=pm, **args_dict).model_dump())
elif args_dict["command"] == "qc":
bedqc(pm=pm, **args_dict)
bedqc(**BedQCCLIModel(pm=pm, **args_dict).model_dump())
elif args_dict["command"] == "stat":
bedstat(pm=pm, **args_dict)
bedstat(**BedStatCLIModel(pm=pm, **args_dict).model_dump())
elif args_dict["command"] == "bunch":
run_bedbuncher(pm=pm, **args_dict)
elif args_dict["command"] == "index":
add_to_qdrant(pm=pm, **args_dict)
elif args_dict["command"] == "requirements-check":
requirements_check()
else:
parser.print_help()
# raise Exception("Incorrect pipeline name.")
Expand Down
69 changes: 47 additions & 22 deletions bedboss/bedbuncher/bedbuncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@
import pephubclient
from pephubclient.helpers import is_registry_path
import logging
from ubiquerg import parse_registry_path

from bedboss.const import (
DEFAULT_BEDBASE_API_URL,
DEFAULT_BEDBASE_CACHE_PATH,
OUTPUT_FOLDER_NAME,
BED_PEP_REGISTRY,
)


Expand All @@ -37,11 +38,14 @@ def create_bedset_from_pep(
_LOGGER.info("Creating bedset from pep.")
new_bedset = BedSet()
for bedfile_id in pep.samples:
bedfile_object = BBClient(
cache_folder=cache_folder,
bedbase_api=bedbase_api,
).load_bed(bedfile_id.get("record_identifier") or bedfile_id.sample_name)
new_bedset.add(bedfile_object)
try:
bedfile_object = BBClient(
cache_folder=cache_folder,
bedbase_api=bedbase_api,
).load_bed(bedfile_id.get("record_identifier") or bedfile_id.sample_name)
new_bedset.add(bedfile_object)
except Exception as err:
pass
_LOGGER.info("Bedset was created successfully")
return new_bedset

Expand Down Expand Up @@ -231,6 +235,7 @@ def run_bedbuncher(
bedbase_api: str = DEFAULT_BEDBASE_API_URL,
cache_path: str = DEFAULT_BEDBASE_CACHE_PATH,
heavy: bool = False,
upload_pephub: bool = False,
*args,
**kwargs,
) -> None:
Expand All @@ -244,6 +249,7 @@ def run_bedbuncher(
:param cache_path: path to the cache folder [DEFAULT: ./bedbase_cache]
:param heavy: whether to use heavy processing (add all columns to the database).
if False -> R-script won't be executed, only basic statistics will be calculated
:param upload_pephub: whether to upload bedset to pephub
:return: None
"""

Expand Down Expand Up @@ -278,22 +284,41 @@ def run_bedbuncher(
_LOGGER.warning(
f"Description for bedset {bedset_name or pep_of_bed.get('name')} was not provided."
)

add_bedset_to_database(
bbc,
record_id=bedset_name or pep_of_bed.name,
bed_set=bedset,
bedset_name=bedset_name or pep_of_bed.name,
genome=dict(pep_of_bed.config.get("genome", {})),
description=pep_of_bed.description or "",
pephub_registry_path=pephub_registry_path,
heavy=heavy,
)
record_id = bedset_name or pep_of_bed.name
try:
add_bedset_to_database(
bbc,
record_id=record_id,
bed_set=bedset,
bedset_name=bedset_name or pep_of_bed.name,
genome=dict(pep_of_bed.config.get("genome", {})),
description=pep_of_bed.description or "",
# pephub_registry_path=pephub_registry_path,
heavy=heavy,
)
except Exception as err:
pass
if upload_pephub:
phc = pephubclient.PEPHubClient()
reg_path_obj = parse_registry_path(BED_PEP_REGISTRY)
bed_ids = [
sample.record_identifier
for sample in pep_of_bed.samples
if sample.get("record_identifier") is not None
]
print(bed_ids)
phc.view.create(
namespace=reg_path_obj["namespace"],
name=reg_path_obj["item"],
tag=reg_path_obj["tag"],
view_name=record_id,
sample_list=bed_ids,
)
return None


if __name__ == "__main__":
run_bedbuncher(
"/media/alex/Extreme SSD/databio/repos/bedbase_all/bedhost/bedbase_configuration_compose.yaml",
"databio/excluderanges:id3",
)
# if __name__ == "__main__":
# run_bedbuncher(
# "/media/alex/Extreme SSD/databio/repos/bedbase_all/bedhost/bedbase_configuration_compose.yaml",
# "databio/excluderanges:id3",
# )
18 changes: 11 additions & 7 deletions bedboss/bedmaker/bedmaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def __init__(
standardize: bool = False,
check_qc: bool = True,
pm: pypiper.PipelineManager = None,
**kwargs,
):
"""
Pypiper pipeline to convert supported file formats into
Expand Down Expand Up @@ -186,11 +185,16 @@ def make(self) -> dict:
# we need this exception to catch the case when the input file is not a bed file
bed_type, bed_format = get_bed_type(self.output_bed)
if self.check_qc:
bedqc(
self.output_bed,
outfolder=os.path.join(self.bed_parent, QC_FOLDER_NAME),
pm=self.pm,
)
try:
bedqc(
self.output_bed,
outfolder=os.path.join(self.bed_parent, QC_FOLDER_NAME),
pm=self.pm,
)
except Exception as e:
raise BedBossException(
f"Quality control failed for {self.output_bed}. Error: {e}"
)

self.make_bigbed(bed_type=bed_type)

Expand Down Expand Up @@ -356,7 +360,7 @@ def copy_with_standardization(self):
except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
if row_count <= max_rows:
row_count += 1
if not df:
if not isinstance(df, pd.DataFrame):
raise BedBossException(
reason=f"Bed file is broken and could not be parsed due to CSV parse error."
)
Expand Down
3 changes: 1 addition & 2 deletions bedboss/bedqc/bedqc.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ def bedqc(
max_region_number: int = MAX_REGION_NUMBER,
min_region_width: int = MIN_REGION_WIDTH,
pm: pypiper.PipelineManager = None,
**kwargs,
) -> bool:
"""
Perform quality checks on a BED file.
Expand All @@ -30,9 +29,9 @@ def bedqc(
:param min_region_width: Minimum region width threshold to pass the quality check.
:param pm: Pypiper object for managing pipeline operations.
:return: True if the file passes the quality check.
:raises QualityException: if the file does not pass the quality
"""
_LOGGER.info("Running bedqc...")
_LOGGER.warning(f"Unused arguments: {kwargs}")

output_file = os.path.join(outfolder, "failed_qc.csv")
bedfile_name = os.path.basename(bedfile)
Expand Down
1 change: 0 additions & 1 deletion bedboss/bedstat/bedstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ def bedstat(
open_signal_matrix: str = None,
just_db_commit: bool = False,
pm: pypiper.PipelineManager = None,
**kwargs,
) -> dict:
"""
Run bedstat pipeline - pipeline for obtaining statistics about bed files
Expand Down
Loading

0 comments on commit 884d99b

Please sign in to comment.