diff --git a/bedboss/__init__.py b/bedboss/__init__.py index 009b3fb..c87ae9a 100644 --- a/bedboss/__init__.py +++ b/bedboss/__init__.py @@ -71,3 +71,10 @@ datefmt="%H:%M:%S", fmt="[%(levelname)s] [%(asctime)s] [BBCONF] %(message)s", ) + +_LOGGER_BBCONF = logging.getLogger("pephubclient") +coloredlogs.install( + logger=_LOGGER_BBCONF, + datefmt="%H:%M:%S", + fmt="[%(levelname)s] [%(asctime)s] [PEPHUBCLIENT] %(message)s", +) diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index 2537c4f..c7033ea 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -7,9 +7,12 @@ import logmuse import peppy from eido import validate_project +import bbconf + import pephubclient +from pephubclient import PEPHubClient from pephubclient.helpers import is_registry_path -import bbconf +from ubiquerg import parse_registry_path from bedboss.bedstat.bedstat import bedstat from bedboss.bedmaker.bedmaker import make_all @@ -21,7 +24,11 @@ BED_FOLDER_NAME, BIGBED_FOLDER_NAME, BEDBOSS_PEP_SCHEMA_PATH, + OUTPUT_FOLDER_NAME, + BEDSTAT_OUTPUT, + BED_PEP_REGISTRY, ) +from bedboss.models import BedMetadata from bedboss.utils import ( extract_file_name, standardize_genome_name, @@ -33,6 +40,79 @@ _LOGGER = logging.getLogger("bedboss") +def load_to_pephub( + pep_registry_path: str, bed_digest: str, genome: str, metadata: dict +) -> None: + """ + Load bedfile and metadata to PEPHUB + + :param str pep_registry_path: registry path to pep on pephub + :param str bed_digest: unique bedfile identifier + :param str genome: genome associated with bedfile + :param dict metadata: Any other metadata that has been collected + + :return None + """ + + if is_registry_path(pep_registry_path): + parsed_pep_dict = parse_registry_path(pep_registry_path) + + # Combine data into a dict for sending to pephub + sample_data = {} + sample_data.update({"sample_name": bed_digest, "genome": genome}) + + metadata = BedMetadata(**metadata).model_dump() + + for key, value in metadata.items(): + # TODO: Confirm this key is 
in the schema + # Then update sample_data + sample_data.update({key: value}) + + try: + PEPHubClient().sample.create( + namespace=parsed_pep_dict["namespace"], + name=parsed_pep_dict["item"], + tag=parsed_pep_dict["tag"], + sample_name=bed_digest, + overwrite=True, + sample_dict=sample_data, + ) + + except Exception as e: # Need more specific exception + _LOGGER.error(f"Failed to upload BEDFILE to PEPhub: See {e}") + else: + _LOGGER.error(f"{pep_registry_path} is not a valid registry path") + + +def load_to_s3( + output_folder: str, + pm: pypiper.PipelineManager, + bed_file: str, + digest: str, + bigbed_file: str = None, +) -> None: + """ + Load bedfiles and statistics to s3 + + :param output_folder: base output folder + :param pm: pipelineManager object + :param bed_file: bedfile name + :param digest: bedfile digest + :param bigbed_file: bigbed file name + :return: NoReturn + """ + command = f"aws s3 cp {os.path.join(output_folder, bed_file)} s3://bedbase/{BED_FOLDER_NAME}" + _LOGGER.info("Uploading to s3 bed file") + pm.run(cmd=command, lock_name="s3_sync_bed") + if bigbed_file: + command = f"aws s3 cp {os.path.join(output_folder, bigbed_file)} s3://bedbase/{BIGBED_FOLDER_NAME}" + _LOGGER.info("Uploading to s3 bigbed file") + pm.run(cmd=command, lock_name="s3_sync_bigbed") + command = f"aws s3 sync {os.path.join(output_folder, OUTPUT_FOLDER_NAME,BEDSTAT_OUTPUT, digest)} s3://bedbase/{OUTPUT_FOLDER_NAME}/{BEDSTAT_OUTPUT}/{digest} --size-only" + _LOGGER.info("Uploading to s3 bed statistic files") + pm.run(cmd=command, lock_name="s3_sync_bedstat") + + def run_all( sample_name: str, input_file: str, @@ -49,9 +129,9 @@ def run_all( ensdb: str = None, other_metadata: dict = None, just_db_commit: bool = False, - no_db_commit: bool = False, + db_commit: bool = True, force_overwrite: bool = False, - skip_qdrant: bool = True, + upload_qdrant: bool = False, upload_s3: bool = False, upload_pephub: bool = False, pm: pypiper.PipelineManager = None, @@ -79,8 +159,9 @@ def 
run_all( (basically genomes that's not in GDdata) :param bool just_db_commit: whether just to commit the JSON to the database (default: False) :param bool force_overwrite: force overwrite analysis - :param bool no_db_commit: whether the JSON commit to the database should be skipped (default: False) - :param bool skip_qdrant: whether to skip qdrant indexing + + :param bool db_commit: whether to commit the JSON to the database (default: True) + :param bool upload_qdrant: whether to upload the bed file to qdrant (default: False) :param bool upload_s3: whether to upload to s3 :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) :param pypiper.PipelineManager pm: pypiper object @@ -91,6 +172,9 @@ def run_all( if isinstance(bedbase_config, str): if not check_db_connection(bedbase_config=bedbase_config): raise BedBossException("Database connection failed. Exiting...") + bbc = bbconf.BedBaseConf(config_path=bedbase_config, database_only=True) + else: + bbc = bedbase_config file_name = extract_file_name(input_file) genome = standardize_genome_name(genome) @@ -135,27 +219,76 @@ def run_all( pm=pm, ) if not other_metadata: - other_metadata = classification_meta - else: - other_metadata.update(classification_meta) + other_metadata = {} + + bed_digest = classification_meta.get("digest") - bed_digest = bedstat( + statistics_dict = bedstat( bedfile=output_bed, outfolder=outfolder, - bedbase_config=bedbase_config, genome=genome, ensdb=ensdb, + bed_digest=bed_digest, open_signal_matrix=open_signal_matrix, bigbed=output_bigbed, - other_metadata=other_metadata, just_db_commit=just_db_commit, - no_db_commit=no_db_commit, - force_overwrite=force_overwrite, - skip_qdrant=skip_qdrant, - upload_s3=upload_s3, - upload_pephub=upload_pephub, pm=pm, ) + statistics_dict.update( + { + "bed_type": classification_meta["bed_type"], + "bed_format": classification_meta["bed_format"], + } + ) + + if db_commit: + bbc.bed.report( + record_identifier=bed_digest, + 
values=statistics_dict, + force_overwrite=force_overwrite, + ) + + if upload_s3: + _LOGGER.info(f"Uploading '{bed_digest}' data to S3 ...") + load_to_s3( + os.path.abspath(outfolder), pm, output_bed, bed_digest, output_bigbed + ) + else: + _LOGGER.info( + f"Skipping uploading '{bed_digest}' data to S3. 'upload_s3' is set to False. " + ) + + if upload_qdrant: + _LOGGER.info(f"Adding '{bed_digest}' vector to Qdrant ...") + + bbc.add_bed_to_qdrant( + bed_id=bed_digest, + bed_file=output_bed, + payload={"digest": bed_digest}, + ) + bbc.bed.report( + record_identifier=bed_digest, + values={"added_to_qdrant": True}, + force_overwrite=True, + ) + else: + _LOGGER.info( + f"Skipping adding '{bed_digest}' vector to Qdrant, 'upload_qdrant' is set to False. " + ) + + if upload_pephub: + _LOGGER.info(f"Uploading metadata of '{bed_digest}' TO PEPhub ...") + load_to_pephub( + pep_registry_path=BED_PEP_REGISTRY, + bed_digest=bed_digest, + genome=genome, + metadata=other_metadata, + ) + else: + _LOGGER.info( + f"Metadata of '{bed_digest}' is NOT uploaded to PEPhub. 'upload_pephub' is set to False. 
" + ) + if stop_pipeline: pm.stop_pipeline() diff --git a/bedboss/bedmaker/bedmaker.py b/bedboss/bedmaker/bedmaker.py index 497fee2..d16c77d 100755 --- a/bedboss/bedmaker/bedmaker.py +++ b/bedboss/bedmaker/bedmaker.py @@ -18,6 +18,7 @@ from refgenconf.exceptions import MissingGenomeError from yacman.exceptions import UndefinedAliasError from ubiquerg import is_command_callable +from geniml.io import RegionSet from bedboss.bedclassifier.bedclassifier import get_bed_type from bedboss.bedqc.bedqc import bedqc @@ -180,10 +181,10 @@ def make(self) -> dict: # converting to bed.gz if needed self.make_bed() try: - bed_type, format_type = get_bed_type(self.input_file) + bed_type, bed_format = get_bed_type(self.input_file) except Exception: # we need this exception to catch the case when the input file is not a bed file - bed_type, format_type = get_bed_type(self.output_bed) + bed_type, bed_format = get_bed_type(self.output_bed) if self.check_qc: bedqc( self.output_bed, @@ -195,8 +196,9 @@ def make(self) -> dict: return { "bed_type": bed_type, - "file_type": format_type, + "bed_format": bed_format, "genome": self.genome, + "digest": RegionSet(self.output_bed).identifier, } def make_bed(self) -> None: @@ -549,7 +551,13 @@ def make_all( ChrUn chromosomes :param check_qc: run quality control during bedmaking :param pm: pypiper object - :return: dict with bed classificator results + :return: dict with generated bed metadata: + { + "bed_type": bed_type. e.g. bed, bigbed + "bed_format": bed_format. e.g. 
narrowpeak, broadpeak + "genome": genome of the sample, + "digest": bedfile identifier, + } """ return BedMaker( input_file=input_file, diff --git a/bedboss/bedstat/bedstat.py b/bedboss/bedstat/bedstat.py index 1a4dbfb..265e215 100755 --- a/bedboss/bedstat/bedstat.py +++ b/bedboss/bedstat/bedstat.py @@ -3,17 +3,12 @@ import os import requests import pypiper -import bbconf import logging from geniml.io import RegionSet -from pephubclient import PEPHubClient -from pephubclient.helpers import is_registry_path -from ubiquerg import parse_registry_path + from bedboss.const import ( OUTPUT_FOLDER_NAME, - BED_FOLDER_NAME, - BIGBED_FOLDER_NAME, BEDSTAT_OUTPUT, OS_HG19, OS_HG38, @@ -24,7 +19,6 @@ ) from bedboss.utils import download_file, convert_unit from bedboss.exceptions import OpenSignalMatrixException -from bedboss.models import BedMetadata _LOGGER = logging.getLogger("bedboss") @@ -33,81 +27,6 @@ os.path.dirname(os.path.realpath(__file__)), "pep_schema.yaml" ) -BED_PEP_REGISTRY = "databio/allbeds:bedbase" - - -def load_to_pephub( - pep_registry_path: str, bed_digest: str, genome: str, metadata: dict -) -> None: - """ - Load bedfile and metadata to PEPHUB - - :param str pep_registry_path: registry path to pep on pephub - :param str bed_digest: unique bedfile identifier - :param str genome: genome associated with bedfile - :param dict metadata: Any other metadata that has been collected - - :return None - """ - - if is_registry_path(pep_registry_path): - parsed_pep_dict = parse_registry_path(pep_registry_path) - - # Combine data into a dict for sending to pephub - sample_data = {} - sample_data.update({"sample_name": bed_digest, "genome": genome}) - - metadata = BedMetadata(**metadata).model_dump() - - for key, value in metadata.items(): - # TODO: Confirm this key is in the schema - # Then update sample_data - sample_data.update({key: value}) - - try: - PEPHubClient().sample.create( - namespace=parsed_pep_dict["namespace"], - name=parsed_pep_dict["item"], - 
tag=parsed_pep_dict["tag"], - sample_name=bed_digest, - overwrite=True, - sample_dict=sample_data, - ) - - except Exception as e: # Need more specific exception - _LOGGER.error(f"Failed to upload BEDFILE to PEPhub: See {e}") - else: - _LOGGER.error(f"{pep_registry_path} is not a valid registry path") - - -def load_to_s3( - output_folder: str, - pm: pypiper.PipelineManager, - bed_file: str, - digest: str, - bigbed_file: str = None, -) -> None: - """ - Load bedfiles and statistics to s3 - - :param output_folder: base output folder - :param pm: pipelineManager object - :param bed_file: bedfile name - :param digest: bedfile digest - :param bigbed_file: bigbed file name - :return: NoReturn - """ - command = f"aws s3 cp {os.path.join(output_folder, bed_file)} s3://bedbase/{BED_FOLDER_NAME}" - _LOGGER.info("Uploading to s3 bed file") - pm.run(cmd=command, lock_name="s3_sync_bed") - if bigbed_file: - command = f"aws s3 cp {os.path.join(output_folder, bigbed_file)} s3://bedbase/{BIGBED_FOLDER_NAME}" - _LOGGER.info("Uploading to s3 bigbed file") - pm.run(cmd=command, lock_name="s3_sync_bigbed") - command = f"aws s3 sync {os.path.join(output_folder, OUTPUT_FOLDER_NAME,BEDSTAT_OUTPUT, digest)} s3://bedbase/{OUTPUT_FOLDER_NAME}/{BEDSTAT_OUTPUT}/{digest} --size-only" - _LOGGER.info("Uploading to s3 bed statistic files") - pm.run(cmd=command, lock_name="s3_sync_bedstat") - def get_osm_path(genome: str, out_path: str = None) -> Union[str, None]: """ @@ -147,22 +66,16 @@ def get_osm_path(genome: str, out_path: str = None) -> Union[str, None]: def bedstat( bedfile: str, - bedbase_config: Union[str, bbconf.BedBaseConf], genome: str, outfolder: str, + bed_digest: str = None, + bigbed: str = None, ensdb: str = None, open_signal_matrix: str = None, - bigbed: str = None, - other_metadata: dict = None, just_db_commit: bool = False, - no_db_commit: bool = False, - force_overwrite: bool = False, - skip_qdrant: bool = True, - upload_s3: bool = False, - upload_pephub: bool = False, pm: 
pypiper.PipelineManager = None, **kwargs, -) -> str: +) -> dict: """ Run bedstat pipeline - pipeline for obtaining statistics about bed files and inserting them into the database @@ -171,24 +84,16 @@ def bedstat( :param str bigbed: the full path to the bigbed file. Defaults to None. (bigbed won't be created and some producing of some statistics will be skipped.) - :param str bedbase_config: The path to the bedbase configuration file, or bbconf object + :param str bed_digest: the digest of the bed file. Defaults to None. :param str open_signal_matrix: a full path to the openSignalMatrix required for the tissue specificity plots :param str outfolder: The folder for storing the pipeline results. :param str genome: genome assembly of the sample :param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata - :param dict other_metadata: a dictionary of other metadata to pass - :param bool just_db_commit: whether just to commit the JSON to the database - :param bool no_db_commit: whether the JSON commit to the database should be - skipped - :param skip_qdrant: whether to skip qdrant indexing [Default: True] - :param bool force_overwrite: whether to overwrite the existing record - :param upload_s3: whether to upload the bed file to s3 - :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) :param pm: pypiper object - :return: bed_digest: the digest of the bed file + :return: dict with statistics and plots metadata """ # TODO why are we no longer using bbconf to get the output path? 
# outfolder_stats = bbc.get_bedstat_output_path() @@ -199,12 +104,6 @@ def bedstat( except FileExistsError: pass - # if bbconf is a string, create a bbconf object - if isinstance(bedbase_config, str): - bbc = bbconf.BedBaseConf(config_path=bedbase_config, database_only=True) - else: - bbc = bedbase_config - # find/download open signal matrix if not open_signal_matrix or not os.path.exists(open_signal_matrix): try: @@ -221,7 +120,8 @@ def bedstat( else: stop_pipeline = False - bed_digest = RegionSet(bedfile).identifier + if not bed_digest: + bed_digest = RegionSet(bedfile).identifier bedfile_name = os.path.split(bedfile)[1] fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0] @@ -277,132 +177,79 @@ def bedstat( pm.run(cmd=command, target=json_file_path) - # commit to the database if no_db_commit is not set - if not no_db_commit: - data = {} - if os.path.exists(json_file_path): - with open(json_file_path, "r", encoding="utf-8") as f: - data = json.loads(f.read()) - if os.path.exists(json_plots_file_path): - with open(json_plots_file_path, "r", encoding="utf-8") as f_plots: - plots = json.loads(f_plots.read()) - else: - plots = [] - - if not other_metadata: - other_metadata = {} + data = {} + if os.path.exists(json_file_path): + with open(json_file_path, "r", encoding="utf-8") as f: + data = json.loads(f.read()) + if os.path.exists(json_plots_file_path): + with open(json_plots_file_path, "r", encoding="utf-8") as f_plots: + plots = json.loads(f_plots.read()) + else: + plots = [] + + # unlist the data, since the output of regionstat.R is a dict of lists of + # length 1 and force keys to lower to correspond with the + # postgres column identifiers + data = {k.lower(): v[0] if isinstance(v, list) else v for k, v in data.items()} + data.update( + { + "bedfile": { + "path": bed_relpath, + "size": convert_unit(os.path.getsize(bedfile)), + "title": "Path to the BED file", + } + } + ) - # unlist the data, since the output of regionstat.R is a dict of lists of - # 
length 1 and force keys to lower to correspond with the - # postgres column identifiers - data = {k.lower(): v[0] if isinstance(v, list) else v for k, v in data.items()} + if os.path.exists(os.path.join(bigbed, fileid + ".bigBed")): data.update( { - "bedfile": { - "path": bed_relpath, - "size": convert_unit(os.path.getsize(bedfile)), - "title": "Path to the BED file", + "bigbedfile": { + "path": bigbed_relpath, + "size": convert_unit( + os.path.getsize(os.path.join(bigbed, fileid + ".bigBed")) + ), + "title": "Path to the big BED file", } } ) - if os.path.exists(os.path.join(bigbed, fileid + ".bigBed")): - data.update( - { - "bigbedfile": { - "path": bigbed_relpath, - "size": convert_unit( - os.path.getsize(os.path.join(bigbed, fileid + ".bigBed")) - ), - "title": "Path to the big BED file", - } - } - ) - - if not os.path.islink(os.path.join(bigbed, fileid + ".bigBed")): - digest = requests.get( - f"http://refgenomes.databio.org/genomes/genome_digest/{genome}" - ).text.strip('""') + if not os.path.islink(os.path.join(bigbed, fileid + ".bigBed")): + digest = requests.get( + f"http://refgenomes.databio.org/genomes/genome_digest/{genome}" + ).text.strip('""') - data.update( - { - "genome": { - "alias": genome, - "digest": digest, - } - } - ) - else: data.update( { "genome": { "alias": genome, - "digest": "", + "digest": digest, } } ) - - for plot in plots: - plot_id = plot["name"] - del plot["name"] - data.update({plot_id: plot}) - - # deleting md5sum, because it is record_identifier - del data["md5sum"] - - # add added_to_qdrant to the data - data["added_to_qdrant"] = False - - # add other to dict in bb database (now we are using pephub for this purpose) - # data["other"] = other_metadata - - bbc.bed.report( - record_identifier=bed_digest, - values=data, - force_overwrite=force_overwrite, - ) - - if upload_s3: - _LOGGER.info(f"Uploading '{bed_digest}' data to S3 ...") - load_to_s3( - os.path.abspath(outfolder), pm, bed_relpath, bed_digest, bigbed_relpath - ) else: - 
_LOGGER.info( - f"Skipping uploading '{bed_digest}' data to S3. 'upload_s3' is set to False. " + data.update( + { + "genome": { + "alias": genome, + "digest": "", + } + } ) - if not skip_qdrant: - _LOGGER.info(f"Adding '{bed_digest}' vector to Qdrant ...") + for plot in plots: + plot_id = plot["name"] + del plot["name"] + data.update({plot_id: plot}) - bbc.add_bed_to_qdrant( - bed_id=bed_digest, - bed_file=bedfile, - payload={"fileid": fileid}, - ) - bbc.bed.report( - record_identifier=bed_digest, - values={"added_to_qdrant": True}, - force_overwrite=True, - ) - else: - _LOGGER.info( - f"Skipping adding '{bed_digest}' vector to Qdrant, 'skip_qdrant' is set to True. " - ) + # deleting md5sum, because it is record_identifier + if "md5sum" in data: + del data["md5sum"] - if upload_pephub: - _LOGGER.info(f"Uploading metadata of '{bed_digest}' TO PEPhub ...") - load_to_pephub( - pep_registry_path=BED_PEP_REGISTRY, - bed_digest=bed_digest, - genome=genome, - metadata=other_metadata, - ) - else: - _LOGGER.info( - f"Metadata of '{bed_digest}' is NOT uploaded to PEPhub. 'upload_pephub' is set to False. " - ) + # add added_to_qdrant to the data + data["added_to_qdrant"] = False if stop_pipeline: pm.stop_pipeline() - return bed_digest + + return data diff --git a/bedboss/bedstat/models.py b/bedboss/bedstat/models.py deleted file mode 100644 index d9d50a0..0000000 --- a/bedboss/bedstat/models.py +++ /dev/null @@ -1,9 +0,0 @@ -from pydantic import BaseModel, ConfigDict, Field - - -class BEDSTAT_RETURN(BaseModel): - """ - Model of single namespace search result - """ - - ... 
diff --git a/bedboss/bedstat/pep_schema.yaml b/bedboss/bedstat/pep_schema.yaml deleted file mode 100644 index 65bc588..0000000 --- a/bedboss/bedstat/pep_schema.yaml +++ /dev/null @@ -1,79 +0,0 @@ -description: bedstat PEP schema - -properties: - samples: - type: array - items: - type: object - properties: - sample_name: - type: string - db_commit: TRUE - description: "name of the sample, which is the name of the output BED file" - input_file_path: - type: string - db_commit: FALSE - description: "absolute path the file to convert" - output_file_path: - type: string - db_commit: FALSE - description: "absolute path the file to the output BED file (derived attribute)" - bigbed: - type: string - db_commit: FALSE - description: "dir path where the bigbed file stored (derived attribute)" - genome: - type: string - db_commit: TRUE - description: "organism genome code" - narrowpeak: - type: boolean - db_commit: TRUE - description: "binary number indicating whether the regions are narrow (transcription factor implies narrow, histone mark implies broad peaks)" - format: - type: string - db_commit: TRUE - description: "file format" - enum: ["bigWig", "bigBed", "bed", "wig", "bedGraph"] - cell_type: - type: string - db_commit: TRUE - description: "cell type code" - antibody: - type: string - db_commit: TRUE - description: "antibody used if ChIP-seq experiment" - description: - type: string - db_commit: TRUE - description: "freeform description of the sample" - exp_protocol: - type: string - db_commit: TRUE - description: "type of the experiment the file was generated in" - data_source: - type: string - db_commit: TRUE - description: "source of the sample, preferably a GSE* code" - treatment: - type: string - db_commit: TRUE - description: "freeform description of the sample treatment" - ensdb: - type: string - db_commit: FALSE - description: "path of gtf annotation for genomes not in GDdata" - fasta: - type: string - db_commit: FALSE - description: "path of for genomes not in 
GDdata" - open_signal_matrix: - type: string - db_commit: FALSE - description: "path of for the open signal matrixm file for the given genome" - required: - - output_file_path - - genome - - sample_name -required: - - samples \ No newline at end of file diff --git a/bedboss/const.py b/bedboss/const.py index 3cd415c..2644bcb 100644 --- a/bedboss/const.py +++ b/bedboss/const.py @@ -61,3 +61,5 @@ BEDBOSS_PEP_SCHEMA_PATH = "https://schema.databio.org/pipelines/bedboss.yaml" REFGENIE_ENV_VAR = "REFGENIE" + +BED_PEP_REGISTRY = "databio/allbeds:bedbase" diff --git a/bedboss/models.py b/bedboss/models.py index 534a681..eba5407 100644 --- a/bedboss/models.py +++ b/bedboss/models.py @@ -12,16 +12,30 @@ class FILE_TYPE(str, Enum): class BedMetadata(BaseModel): sample_name: str genome: str - format_type: FILE_TYPE = FILE_TYPE.BED - bed_type: str = Field( - default="bed3", pattern="^bed(?:[3-9]|1[0-5])(?:\+|$)[0-9]?+$" + organism: str = "" + species_id: str = "" + cell_type: str = "" + cell_line: str = "" + exp_protocol: str = Field("", description="Experimental protocol (e.g. ChIP-seq)") + library_source: str = Field( + "", description="Library source (e.g. genomic, transcriptomic)" ) - description: str = None - organism: str = None - cell_type: str = None - tissue: str = None - antibody: str = None - sample_library_strategy: str = None + genotype: str = Field("", description="Genotype of the sample") + target: str = Field("", description="Target of the assay (e.g. H3K4me3)") + antibody: str = Field("", description="Antibody used in the assay") + treatment: str = Field( + "", description="Treatment of the sample (e.g. 
drug treatment)" + ) + tissue: str = Field("", description="Tissue type") + global_sample_id: str = Field("", description="Global sample identifier") + global_experiment_id: str = Field("", description="Global experiment identifier") + description: str = Field("", description="Description of the sample") + + # THIS IS NOW PART OF THE BedBase model in bbconf + # bed_format: FILE_TYPE = FILE_TYPE.BED + # bed_type: str = Field( + # default="bed3", pattern="^bed(?:[3-9]|1[0-5])(?:\+|$)[0-9]?+$" + # ) model_config = ConfigDict( populate_by_name=True, diff --git a/test/test_bedboss.py b/test/test_bedboss.py index d17359d..25b2879 100644 --- a/test/test_bedboss.py +++ b/test/test_bedboss.py @@ -122,11 +122,8 @@ def test_stat(self, bedfile, bigbed_file, genome, output_temp_dir): "command": "stat", "bedfile": bedfile, "outfolder": output_temp_dir, - "bedbase_config": BEDBASE_CONFIG, "genome": genome, "bigbed": bigbed_file, - "no_db_commit": True, - "skip_qdrant": True, "multy": True, } ) diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index 5f0ba21..21f71d2 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -33,7 +33,7 @@ def test_classification(): def test_get_bed_type(): bedtype = get_bed_type(bed=FILE_PATH_UNZIPPED) - assert bedtype == ("bed6+3", "bed") + assert bedtype == ("bed6+3", "broadpeak") @pytest.mark.parametrize(