Skip to content

Commit

Permalink
Merge pull request #40 from databio/restructuring
Browse files Browse the repository at this point in the history
Restructuring
  • Loading branch information
khoroshevskyi authored Feb 21, 2024
2 parents 57bc897 + ffa08e9 commit 3dbd1db
Show file tree
Hide file tree
Showing 10 changed files with 255 additions and 335 deletions.
7 changes: 7 additions & 0 deletions bedboss/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,10 @@
datefmt="%H:%M:%S",
fmt="[%(levelname)s] [%(asctime)s] [BBCONF] %(message)s",
)

# Attach colored log formatting to the pephubclient logger.
# NOTE: this variable was previously named _LOGGER_BBCONF (copy-paste from the
# bbconf logger block above), which silently rebound that name; a dedicated
# name avoids the shadowing.
_LOGGER_PEPHUBCLIENT = logging.getLogger("pephubclient")
coloredlogs.install(
    logger=_LOGGER_PEPHUBCLIENT,
    datefmt="%H:%M:%S",
    fmt="[%(levelname)s] [%(asctime)s] [PEPHUBCLIENT] %(message)s",
)
165 changes: 149 additions & 16 deletions bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@
import logmuse
import peppy
from eido import validate_project
import bbconf

import pephubclient
from pephubclient import PEPHubClient
from pephubclient.helpers import is_registry_path
import bbconf
from ubiquerg import parse_registry_path

from bedboss.bedstat.bedstat import bedstat
from bedboss.bedmaker.bedmaker import make_all
Expand All @@ -21,7 +24,11 @@
BED_FOLDER_NAME,
BIGBED_FOLDER_NAME,
BEDBOSS_PEP_SCHEMA_PATH,
OUTPUT_FOLDER_NAME,
BEDSTAT_OUTPUT,
BED_PEP_REGISTRY,
)
from bedboss.models import BedMetadata
from bedboss.utils import (
extract_file_name,
standardize_genome_name,
Expand All @@ -33,6 +40,79 @@
_LOGGER = logging.getLogger("bedboss")


def load_to_pephub(
    pep_registry_path: str, bed_digest: str, genome: str, metadata: dict
) -> None:
    """
    Upload a bed file's metadata as a sample to a PEP on PEPhub.

    :param pep_registry_path: registry path to the PEP on PEPhub
        (e.g. "namespace/name:tag")
    :param bed_digest: unique bed file identifier; used as the sample name
    :param genome: genome associated with the bed file
    :param metadata: any other metadata that has been collected; validated and
        normalized through the BedMetadata model before upload
    :return: None
    """
    # Guard clause: bail out early on a malformed registry path.
    if not is_registry_path(pep_registry_path):
        _LOGGER.error(f"{pep_registry_path} is not a valid registry path")
        return

    parsed_pep_dict = parse_registry_path(pep_registry_path)

    # Combine data into a dict for sending to pephub. Metadata is merged after
    # the base fields, so keys present in `metadata` take precedence.
    sample_data = {"sample_name": bed_digest, "genome": genome}
    # Round-trip through BedMetadata so only model-defined keys are sent.
    sample_data.update(BedMetadata(**metadata).model_dump())

    try:
        PEPHubClient().sample.create(
            namespace=parsed_pep_dict["namespace"],
            name=parsed_pep_dict["item"],
            tag=parsed_pep_dict["tag"],
            sample_name=bed_digest,
            overwrite=True,
            sample_dict=sample_data,
        )
    except Exception as e:  # TODO: catch the specific pephubclient exception
        _LOGGER.error(f"Failed to upload BEDFILE to PEPhub: See {e}")


def load_to_s3(
    output_folder: str,
    pm: pypiper.PipelineManager,
    bed_file: str,
    digest: str,
    bigbed_file: str = None,
) -> None:
    """
    Upload the bed file, optional bigbed file, and statistics output to s3.

    Runs the aws cli through the pipeline manager, so each upload is logged
    and lock-protected by pypiper.

    :param output_folder: base output folder
    :param pm: PipelineManager object used to execute the aws cli commands
    :param bigbed_file: bigbed file name (relative to output_folder); the
        bigbed upload is skipped when this is falsy
    :param bed_file: bed file name (relative to output_folder)
    :param digest: bed file digest, which names the statistics sub-folder
    :return: None
    """
    command = f"aws s3 cp {os.path.join(output_folder, bed_file)} s3://bedbase/{BED_FOLDER_NAME}"
    _LOGGER.info("Uploading to s3 bed file")
    pm.run(cmd=command, lock_name="s3_sync_bed")
    if bigbed_file:
        command = f"aws s3 cp {os.path.join(output_folder, bigbed_file)} s3://bedbase/{BIGBED_FOLDER_NAME}"
        _LOGGER.info("Uploading to s3 bigbed file")
        pm.run(cmd=command, lock_name="s3_sync_bigbed")
    # --size-only: sync re-uploads only statistics files whose size changed.
    stats_dir = os.path.join(output_folder, OUTPUT_FOLDER_NAME, BEDSTAT_OUTPUT, digest)
    command = f"aws s3 sync {stats_dir} s3://bedbase/{OUTPUT_FOLDER_NAME}/{BEDSTAT_OUTPUT}/{digest} --size-only"
    _LOGGER.info("Uploading to s3 bed statistic files")
    pm.run(cmd=command, lock_name="s3_sync_bedstat")


def run_all(
sample_name: str,
input_file: str,
Expand All @@ -49,9 +129,9 @@ def run_all(
ensdb: str = None,
other_metadata: dict = None,
just_db_commit: bool = False,
no_db_commit: bool = False,
db_commit: bool = True,
force_overwrite: bool = False,
skip_qdrant: bool = True,
upload_qdrant: bool = False,
upload_s3: bool = False,
upload_pephub: bool = False,
pm: pypiper.PipelineManager = None,
Expand Down Expand Up @@ -79,8 +159,9 @@ def run_all(
(basically genomes that's not in GDdata)
:param bool just_db_commit: whether just to commit the JSON to the database (default: False)
:param bool force_overwrite: force overwrite analysis
:param bool no_db_commit: whether the JSON commit to the database should be skipped (default: False)
:param bool skip_qdrant: whether to skip qdrant indexing
:param bool db_commit: whether the JSON commit to the database should be skipped (default: False)
:param bool upload_qdrant: whether to skip qdrant indexing
:param bool upload_s3: whether to upload to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param pypiper.PipelineManager pm: pypiper object
Expand All @@ -91,6 +172,9 @@ def run_all(
if isinstance(bedbase_config, str):
if not check_db_connection(bedbase_config=bedbase_config):
raise BedBossException("Database connection failed. Exiting...")
bbc = bbconf.BedBaseConf(config_path=bedbase_config, database_only=True)
else:
bbc = bedbase_config

file_name = extract_file_name(input_file)
genome = standardize_genome_name(genome)
Expand Down Expand Up @@ -135,27 +219,76 @@ def run_all(
pm=pm,
)
if not other_metadata:
other_metadata = classification_meta
else:
other_metadata.update(classification_meta)
other_metadata = {}

bed_digest = classification_meta.get("digest")

bed_digest = bedstat(
statistics_dict = bedstat(
bedfile=output_bed,
outfolder=outfolder,
bedbase_config=bedbase_config,
genome=genome,
ensdb=ensdb,
bed_digest=bed_digest,
open_signal_matrix=open_signal_matrix,
bigbed=output_bigbed,
other_metadata=other_metadata,
just_db_commit=just_db_commit,
no_db_commit=no_db_commit,
force_overwrite=force_overwrite,
skip_qdrant=skip_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
pm=pm,
)
statistics_dict.update(
{
"bed_type": classification_meta["bed_type"],
"bed_format": classification_meta["bed_format"],
}
)

if db_commit:
bbc.bed.report(
record_identifier=bed_digest,
values=statistics_dict,
force_overwrite=force_overwrite,
)

if upload_s3:
_LOGGER.info(f"Uploading '{bed_digest}' data to S3 ...")
load_to_s3(
os.path.abspath(outfolder), pm, output_bed, bed_digest, output_bigbed
)
else:
_LOGGER.info(
f"Skipping uploading '{bed_digest}' data to S3. 'upload_s3' is set to False. "
)

if upload_qdrant:
_LOGGER.info(f"Adding '{bed_digest}' vector to Qdrant ...")

bbc.add_bed_to_qdrant(
bed_id=bed_digest,
bed_file=output_bed,
payload={"digest": bed_digest},
)
bbc.bed.report(
record_identifier=bed_digest,
values={"added_to_qdrant": True},
force_overwrite=True,
)
else:
_LOGGER.info(
f"Skipping adding '{bed_digest}' vector to Qdrant, 'skip_qdrant' is set to True. "
)

if upload_pephub:
_LOGGER.info(f"Uploading metadata of '{bed_digest}' TO PEPhub ...")
load_to_pephub(
pep_registry_path=BED_PEP_REGISTRY,
bed_digest=bed_digest,
genome=genome,
metadata=other_metadata,
)
else:
_LOGGER.info(
f"Metadata of '{bed_digest}' is NOT uploaded to PEPhub. 'upload_pephub' is set to False. "
)

if stop_pipeline:
pm.stop_pipeline()

Expand Down
16 changes: 12 additions & 4 deletions bedboss/bedmaker/bedmaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from refgenconf.exceptions import MissingGenomeError
from yacman.exceptions import UndefinedAliasError
from ubiquerg import is_command_callable
from geniml.io import RegionSet

from bedboss.bedclassifier.bedclassifier import get_bed_type
from bedboss.bedqc.bedqc import bedqc
Expand Down Expand Up @@ -180,10 +181,10 @@ def make(self) -> dict:
# converting to bed.gz if needed
self.make_bed()
try:
bed_type, format_type = get_bed_type(self.input_file)
bed_type, bed_format = get_bed_type(self.input_file)
except Exception:
# we need this exception to catch the case when the input file is not a bed file
bed_type, format_type = get_bed_type(self.output_bed)
bed_type, bed_format = get_bed_type(self.output_bed)
if self.check_qc:
bedqc(
self.output_bed,
Expand All @@ -195,8 +196,9 @@ def make(self) -> dict:

return {
"bed_type": bed_type,
"file_type": format_type,
"bed_format": bed_format,
"genome": self.genome,
"digest": RegionSet(self.output_bed).identifier,
}

def make_bed(self) -> None:
Expand Down Expand Up @@ -549,7 +551,13 @@ def make_all(
ChrUn chromosomes
:param check_qc: run quality control during bedmaking
:param pm: pypiper object
:return: dict with bed classificator results
:return: dict with generated bed metadata:
{
"bed_type": bed_type. e.g. bed, bigbed
"bed_format": bed_format. e.g. narrowpeak, broadpeak
"genome": genome of the sample,
"digest": bedfile identifier,
}
"""
return BedMaker(
input_file=input_file,
Expand Down
Loading

0 comments on commit 3dbd1db

Please sign in to comment.