diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py
index d45fd0f..bc83dd6 100644
--- a/bedboss/bedboss.py
+++ b/bedboss/bedboss.py
@@ -87,6 +87,7 @@ def run_all(
     open_signal_matrix: str = None,
     ensdb: str = None,
     treatment: str = None,
+    pep_sample_dict: dict = None,
     description: str = None,
     cell_type: str = None,
     other_metadata: dict = None,
@@ -95,6 +96,7 @@ def run_all(
     force_overwrite: bool = False,
     skip_qdrant: bool = True,
     upload_s3: bool = False,
+    upload_pephub: bool = False,
     pm: pypiper.PipelineManager = None,
     **kwargs,
 ) -> str:
@@ -116,6 +118,7 @@ def run_all(
     :param str description: a description of the bed file
     :param str open_signal_matrix: a full path to the openSignalMatrix required for the tissue [optional]
     :param str treatment: a treatment of the bed file
+    :param dict pep_sample_dict: a dict containing all attributes from the sample
     :param str cell_type: a cell type of the bed file
     :param dict other_metadata: a dictionary of other metadata to pass
     :param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata [optional]
@@ -125,6 +128,7 @@ def run_all(
     :param bool no_db_commit: whether the JSON commit to the database should be skipped (default: False)
     :param bool skip_qdrant: whether to skip qdrant indexing
     :param bool upload_s3: whether to upload to s3
+    :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
     :param pypiper.PipelineManager pm: pypiper object
     :return str bed_digest: bed digest
     """
@@ -191,6 +195,7 @@ def run_all(
         bigbed=output_bigbed,
         description=description,
         treatment=treatment,
+        pep_sample_dict=pep_sample_dict,
         cell_type=cell_type,
         other_metadata=other_metadata,
         just_db_commit=just_db_commit,
@@ -198,6 +203,7 @@ def run_all(
         force_overwrite=force_overwrite,
         skip_qdrant=skip_qdrant,
         upload_s3=upload_s3,
+        upload_pephub=upload_pephub,
         pm=pm,
     )
     return bed_digest
@@ -217,6 +223,7 @@ def insert_pep(
     no_db_commit: bool = False,
     force_overwrite: bool = False,
     upload_s3: bool = False,
+    upload_pephub: bool = False,
     pm: pypiper.PipelineManager = None,
     *args,
     **kwargs,
@@ -238,6 +245,7 @@ def insert_pep(
     :param bool no_db_commit: whether the JSON commit to the database should be skipped
     :param bool force_overwrite: whether to overwrite the existing record
     :param bool upload_s3: whether to upload to s3
+    :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
     :param pypiper.PipelineManager pm: pypiper object
     :return: None
     """
@@ -277,6 +285,7 @@ def insert_pep(
             description=pep_sample.get("description"),
             cell_type=pep_sample.get("cell_type"),
             treatment=pep_sample.get("treatment"),
+            pep_sample_dict=pep_sample.to_dict(),
             outfolder=output_folder,
             bedbase_config=bbc,
             rfg_config=rfg_config,
@@ -288,6 +297,7 @@ def insert_pep(
             force_overwrite=force_overwrite,
             skip_qdrant=skip_qdrant,
             upload_s3=upload_s3,
+            upload_pephub=upload_pephub,
             pm=pm,
         )
         pep.samples[i].record_identifier = bed_id
diff --git a/bedboss/bedstat/bedstat.py b/bedboss/bedstat/bedstat.py
index d3ec79f..034d5b7 100755
--- a/bedboss/bedstat/bedstat.py
+++ b/bedboss/bedstat/bedstat.py
@@ -5,7 +5,11 @@
 import pypiper
 import bbconf
 import logging
+import pephubclient as phc
 from geniml.io import RegionSet
+from pephubclient import PEPHubClient
+from pephubclient.helpers import is_registry_path
+from ubiquerg import parse_registry_path
 
 from bedboss.const import (
     OUTPUT_FOLDER_NAME,
@@ -21,6 +25,8 @@
     os.path.dirname(os.path.realpath(__file__)), "pep_schema.yaml"
 )
 
+BED_PEP_REGISTRY = "databio/allbeds:bedbase"
+
 def convert_unit(size_in_bytes: int) -> str:
     """
@@ -38,6 +44,48 @@ def convert_unit(size_in_bytes: int) -> str:
         return str(round(size_in_bytes / (1024 * 1024 * 1024))) + "GB"
 
 
+def load_to_pephub(
+    pep_registry_path: str, bed_digest: str, genome: str, metadata: dict
+) -> None:
+    """
+    Load bedfile and metadata to PEPHUB
+
+    :param str pep_registry_path: registry path to pep on pephub
+    :param str bed_digest: unique bedfile identifier
+    :param str genome: genome associated with bedfile
+    :param dict metadata: any other metadata that has been collected
+
+    :return: None
+    """
+
+    if is_registry_path(pep_registry_path):
+        parsed_pep_dict = parse_registry_path(pep_registry_path)
+
+        # Combine data into a dict for sending to pephub
+        sample_data = {}
+        sample_data.update({"sample_name": bed_digest, "genome": genome})
+
+        for key, value in metadata.items():
+            # TODO Confirm this key is in the schema
+            # Then update sample_data
+            sample_data.update({key: value})
+
+        try:
+            PEPHubClient().sample.create(
+                namespace=parsed_pep_dict["namespace"],
+                name=parsed_pep_dict["item"],
+                tag=parsed_pep_dict["tag"],
+                sample_name=bed_digest,
+                overwrite=True,
+                sample_dict=sample_data,
+            )
+
+        except Exception as e:  # Need more specific exception
+            _LOGGER.warning(f"Failed to upload BED file to PEPhub: {e}")
+    else:
+        _LOGGER.warning(f"{pep_registry_path} is not a valid registry path")
+
+
 def load_to_s3(
     output_folder: str,
     pm: pypiper.PipelineManager,
@@ -76,6 +124,7 @@ def bedstat(
     open_signal_matrix: str = None,
     bigbed: str = None,
     treatment: str = None,
+    pep_sample_dict: dict = None,
     description: str = None,
     cell_type: str = None,
     other_metadata: dict = None,
@@ -84,6 +133,7 @@ def bedstat(
     force_overwrite: bool = False,
     skip_qdrant: bool = True,
     upload_s3: bool = False,
+    upload_pephub: bool = False,
     pm: pypiper.PipelineManager = None,
     **kwargs,
 ) -> str:
@@ -104,6 +154,7 @@ def bedstat(
         not in GDdata
     :param str description: a description of the bed file
     :param str treatment: a treatment of the bed file
+    :param dict pep_sample_dict: a dict containing all attributes from the sample
     :param str cell_type: a cell type of the bed file
     :param dict other_metadata: a dictionary of other metadata to pass
     :param bool just_db_commit: whether just to commit the JSON to the database
@@ -112,12 +163,14 @@ def bedstat(
     :param skip_qdrant: whether to skip qdrant indexing [Default: True]
     :param bool force_overwrite: whether to overwrite the existing record
     :param upload_s3: whether to upload the bed file to s3
+    :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
     :param pm: pypiper object
     :return: bed_digest: the digest of the bed file
     """
 
     # TODO why are we no longer using bbconf to get the output path?
     # outfolder_stats = bbc.get_bedstat_output_path()
+
     outfolder_stats = os.path.join(outfolder, OUTPUT_FOLDER_NAME, BEDSTAT_OUTPUT)
     try:
         os.makedirs(outfolder_stats)
@@ -211,6 +264,11 @@ def bedstat(
             }
         )
 
+    # For now, add all the *other* sample attributes to other_metadata
+    for key, value in (pep_sample_dict or {}).items():
+        if key not in list(other_metadata.keys()):
+            other_metadata.update({key: value})
+
     # unlist the data, since the output of regionstat.R is a dict of lists of
     # length 1 and force keys to lower to correspond with the
     # postgres column identifiers
@@ -295,6 +353,15 @@ def bedstat(
             force_overwrite=True,
         )
 
+    if upload_pephub:
+        _LOGGER.info("UPLOADING TO PEPHUB...")
+        load_to_pephub(
+            pep_registry_path=BED_PEP_REGISTRY,
+            bed_digest=bed_digest,
+            genome=genome,
+            metadata=other_metadata,
+        )
+
     if stop_pipeline:
         pm.stop_pipeline()
     return bed_digest
diff --git a/bedboss/cli.py b/bedboss/cli.py
index 2d161ef..116f57f 100644
--- a/bedboss/cli.py
+++ b/bedboss/cli.py
@@ -166,6 +166,11 @@ def build_argparser() -> ArgumentParser:
         action="store_true",
         help="whether to skip qdrant indexing",
     )
+    sub_all.add_argument(
+        "--upload-pephub",
+        action="store_true",
+        help="upload bedfiles and metadata to pephub",
+    )
 
     # all-pep
     sub_all_pep.add_argument(
@@ -245,6 +250,11 @@ def build_argparser() -> ArgumentParser:
         "Before uploading you have to set up all necessury env vars: "
         "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_ENDPOINT_URL. [Default: False]",
     )
+    sub_all_pep.add_argument(
+        "--upload-pephub",
+        action="store_true",
+        help="upload bedfiles and metadata to pephub",
+    )
 
     # bed_qc
     sub_qc.add_argument(