Skip to content

Commit

Permalink
Merge pull request #33 from databio/dev_phc_bed_Metadata
Browse files Browse the repository at this point in the history
Allow uploading meta data from input pep to an output pep via registry path
  • Loading branch information
khoroshevskyi authored Feb 12, 2024
2 parents 5054da2 + 9bcaee4 commit 52aba4d
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 0 deletions.
10 changes: 10 additions & 0 deletions bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def run_all(
open_signal_matrix: str = None,
ensdb: str = None,
treatment: str = None,
pep_sample_dict: dict = None,
description: str = None,
cell_type: str = None,
other_metadata: dict = None,
Expand All @@ -95,6 +96,7 @@ def run_all(
force_overwrite: bool = False,
skip_qdrant: bool = True,
upload_s3: bool = False,
upload_pephub: bool = False,
pm: pypiper.PipelineManager = None,
**kwargs,
) -> str:
Expand All @@ -116,6 +118,7 @@ def run_all(
:param str description: a description of the bed file
:param str open_signal_matrix: a full path to the openSignalMatrix required for the tissue [optional]
:param str treatment: a treatment of the bed file
:param dict pep_sample_dict: a dict containing all attributes from the sample
:param str cell_type: a cell type of the bed file
:param dict other_metadata: a dictionary of other metadata to pass
:param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata [optional]
Expand All @@ -125,6 +128,7 @@ def run_all(
:param bool no_db_commit: whether the JSON commit to the database should be skipped (default: False)
:param bool skip_qdrant: whether to skip qdrant indexing
:param bool upload_s3: whether to upload to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param pypiper.PipelineManager pm: pypiper object
:return str bed_digest: bed digest
"""
Expand Down Expand Up @@ -191,13 +195,15 @@ def run_all(
bigbed=output_bigbed,
description=description,
treatment=treatment,
pep_sample_dict=pep_sample_dict,
cell_type=cell_type,
other_metadata=other_metadata,
just_db_commit=just_db_commit,
no_db_commit=no_db_commit,
force_overwrite=force_overwrite,
skip_qdrant=skip_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
pm=pm,
)
return bed_digest
Expand All @@ -217,6 +223,7 @@ def insert_pep(
no_db_commit: bool = False,
force_overwrite: bool = False,
upload_s3: bool = False,
upload_pephub: bool = False,
pm: pypiper.PipelineManager = None,
*args,
**kwargs,
Expand All @@ -238,6 +245,7 @@ def insert_pep(
:param bool no_db_commit: whether the JSON commit to the database should be skipped
:param bool force_overwrite: whether to overwrite the existing record
:param bool upload_s3: whether to upload to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param pypiper.PipelineManager pm: pypiper object
:return: None
"""
Expand Down Expand Up @@ -277,6 +285,7 @@ def insert_pep(
description=pep_sample.get("description"),
cell_type=pep_sample.get("cell_type"),
treatment=pep_sample.get("treatment"),
pep_sample_dict=pep_sample.to_dict(),
outfolder=output_folder,
bedbase_config=bbc,
rfg_config=rfg_config,
Expand All @@ -288,6 +297,7 @@ def insert_pep(
force_overwrite=force_overwrite,
skip_qdrant=skip_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
pm=pm,
)
pep.samples[i].record_identifier = bed_id
Expand Down
67 changes: 67 additions & 0 deletions bedboss/bedstat/bedstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
import pypiper
import bbconf
import logging
import pephubclient as phc
from geniml.io import RegionSet
from pephubclient import PEPHubClient
from pephubclient.helpers import is_registry_path
from ubiquerg import parse_registry_path

from bedboss.const import (
OUTPUT_FOLDER_NAME,
Expand All @@ -21,6 +25,8 @@
os.path.dirname(os.path.realpath(__file__)), "pep_schema.yaml"
)

BED_PEP_REGISTRY = "databio/allbeds:bedbase"


def convert_unit(size_in_bytes: int) -> str:
"""
Expand All @@ -38,6 +44,48 @@ def convert_unit(size_in_bytes: int) -> str:
return str(round(size_in_bytes / (1024 * 1024 * 1024))) + "GB"


def load_to_pephub(
    pep_registry_path: str, bed_digest: str, genome: str, metadata: dict
) -> None:
    """
    Load bedfile and metadata to PEPHUB

    :param str pep_registry_path: registry path to pep on pephub
        (expected form: "namespace/name:tag")
    :param str bed_digest: unique bedfile identifier; used as the sample_name
    :param str genome: genome associated with bedfile
    :param dict metadata: Any other metadata that has been collected
    :return None
    """
    # Guard clause: bail out early on malformed registry paths.
    if not is_registry_path(pep_registry_path):
        _LOGGER.warning(f"{pep_registry_path} is not a valid registry path")
        return

    parsed_pep_dict = parse_registry_path(pep_registry_path)

    # Combine data into a dict for sending to pephub
    sample_data = {"sample_name": bed_digest, "genome": genome}

    for key, value in metadata.items():
        # TODO Confirm this key is in the schema
        # Then update sample_data
        sample_data.update({key: value})

    try:
        PEPHubClient().sample.create(
            namespace=parsed_pep_dict["namespace"],
            name=parsed_pep_dict["item"],
            # BUGFIX: previously reused the "item" field here; the tag is a
            # separate component of the registry path (parse_registry_path
            # returns it under the "tag" key — TODO confirm against ubiquerg).
            tag=parsed_pep_dict["tag"],
            sample_name=bed_digest,
            overwrite=True,
            sample_dict=sample_data,
        )

    except Exception as e:  # Need more specific exception
        # BUGFIX: message previously said "Bedbase", but this call uploads
        # to PEPhub; keep the log accurate so failures are triaged correctly.
        _LOGGER.warning(f"Failed to upload BEDFILE to PEPhub: See {e}")


def load_to_s3(
output_folder: str,
pm: pypiper.PipelineManager,
Expand Down Expand Up @@ -76,6 +124,7 @@ def bedstat(
open_signal_matrix: str = None,
bigbed: str = None,
treatment: str = None,
pep_sample_dict: dict = None,
description: str = None,
cell_type: str = None,
other_metadata: dict = None,
Expand All @@ -84,6 +133,7 @@ def bedstat(
force_overwrite: bool = False,
skip_qdrant: bool = True,
upload_s3: bool = False,
upload_pephub: bool = False,
pm: pypiper.PipelineManager = None,
**kwargs,
) -> str:
Expand All @@ -104,6 +154,7 @@ def bedstat(
not in GDdata
:param str description: a description of the bed file
:param str treatment: a treatment of the bed file
:param dict pep_sample_dict: a dict containing all attributes from the sample
:param str cell_type: a cell type of the bed file
:param dict other_metadata: a dictionary of other metadata to pass
:param bool just_db_commit: whether just to commit the JSON to the database
Expand All @@ -112,12 +163,14 @@ def bedstat(
:param skip_qdrant: whether to skip qdrant indexing [Default: True]
:param bool force_overwrite: whether to overwrite the existing record
:param upload_s3: whether to upload the bed file to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param pm: pypiper object
:return: bed_digest: the digest of the bed file
"""
# TODO why are we no longer using bbconf to get the output path?
# outfolder_stats = bbc.get_bedstat_output_path()

outfolder_stats = os.path.join(outfolder, OUTPUT_FOLDER_NAME, BEDSTAT_OUTPUT)
try:
os.makedirs(outfolder_stats)
Expand Down Expand Up @@ -211,6 +264,11 @@ def bedstat(
}
)

# For now, add all the *other* attributes to other_metadata
for key, value in pep_sample_dict.items():
if key not in list(other_metadata.keys()):
other_metadata.update({key: value})

# unlist the data, since the output of regionstat.R is a dict of lists of
# length 1 and force keys to lower to correspond with the
# postgres column identifiers
Expand Down Expand Up @@ -295,6 +353,15 @@ def bedstat(
force_overwrite=True,
)

if upload_pephub:
_LOGGER.info("UPLOADING TO PEPHUB...")
load_to_pephub(
pep_registry_path=BED_PEP_REGISTRY,
bed_digest=bed_digest,
genome=genome,
metadata=other_metadata,
)

if stop_pipeline:
pm.stop_pipeline()
return bed_digest
10 changes: 10 additions & 0 deletions bedboss/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,11 @@ def build_argparser() -> ArgumentParser:
action="store_true",
help="whether to skip qdrant indexing",
)
sub_all.add_argument(
"--upload-pephub",
action="store_true",
help="upload to pephub",
)

# all-pep
sub_all_pep.add_argument(
Expand Down Expand Up @@ -245,6 +250,11 @@ def build_argparser() -> ArgumentParser:
"Before uploading you have to set up all necessury env vars: "
"AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_ENDPOINT_URL. [Default: False]",
)
sub_all_pep.add_argument(
"--upload-pephub",
action="store_true",
help="upload to pephub",
)

# bed_qc
sub_qc.add_argument(
Expand Down

0 comments on commit 52aba4d

Please sign in to comment.