def run_all_by_pep(pep: Union[str, peppy.Project]) -> NoReturn:
    """
    Run bedboss pipeline by providing pep config file.

    Each sample in the PEP is passed to ``run_all``. The attributes
    sample_name, input_file, input_type, outfolder, genome and bedbase_config
    are read directly from the sample and therefore must be present.
    Optional attributes are forwarded only when the sample defines them, so
    that the defaults declared by ``run_all`` stay in effect for anything the
    PEP leaves out.

    :param pep: path to the pep config file or peppy.Project object
    :raises TypeError: if pep is neither a string nor a peppy.Project
    """
    if isinstance(pep, str):
        pep = peppy.Project(pep)
    elif not isinstance(pep, peppy.Project):
        raise TypeError(
            f"Incorrect pep type: {type(pep).__name__}. "
            f"Expected a path to the pep config file or a peppy.Project object."
        )

    # Sample attributes that may be missing from the PEP.
    optional_fields = (
        "rfg_config",
        "narrowpeak",
        "check_qc",
        "standard_chrom",
        "chrom_sizes",
        "open_signal_matrix",
        "ensdb",
        "sample_yaml",
        "just_db_commit",
        "no_db_commit",
        "force_overwrite",
        "skip_qdrant",
    )

    for pep_sample in pep.samples:
        _LOGGER.info(f"Running bedboss pipeline for {pep_sample.sample_name}")

        # Passing None for an absent attribute would override the default
        # of the corresponding run_all parameter (e.g. turn a default-on
        # flag off), so only attributes that are actually set are passed.
        # NOTE(review): assumes every optional parameter of run_all has a
        # default value -- confirm against the run_all signature.
        optional_kwargs = {}
        for field in optional_fields:
            value = pep_sample.get(field)
            if value is not None:
                optional_kwargs[field] = value

        run_all(
            sample_name=pep_sample.sample_name,
            input_file=pep_sample.input_file,
            input_type=pep_sample.input_type,
            outfolder=pep_sample.outfolder,
            genome=pep_sample.genome,
            bedbase_config=pep_sample.bedbase_config,
            **optional_kwargs,
        )
@@ -196,6 +236,8 @@ def main(test_args: dict = None) -> NoReturn: bedqc(pm=pm, **args_dict) elif args_dict["command"] == "stat": bedstat(pm=pm, **args_dict) + elif args_dict["command"] == "all-pep": + run_all_by_pep(args_dict["pep_config"]) else: parser.print_help() # raise Exception("Incorrect pipeline name.") diff --git a/bedboss/bedmaker/bedmaker.py b/bedboss/bedmaker/bedmaker.py index 5ff6966..f0e573e 100755 --- a/bedboss/bedmaker/bedmaker.py +++ b/bedboss/bedmaker/bedmaker.py @@ -84,14 +84,14 @@ def __init__( If true, filter the input file to contain only the standard chromosomes, remove regions on ChrUn chromosomes - :param check_qc: run quality control during badmaking + :param check_qc: run quality control during bedmaking :param pm: pypiper object :return: noReturn """ # Define file paths self.input_file = input_file - self.input_type = input_type + self.input_type = input_type.lower() self.output_bed = output_bed self.output_bigbed = output_bigbed self.file_name = os.path.basename(input_file) @@ -186,7 +186,7 @@ def make_bed(self) -> NoReturn: on input file type and execute the command. 
""" - _LOGGER.info(f"Converting {self.input_file} to BED format.") + _LOGGER.info(f"Converting {os.path.abspath(self.input_file)} to BED format.") temp_bed_path = os.path.splitext(self.output_bed)[0] # creat cmd to run that convert non bed file to bed file @@ -195,14 +195,14 @@ def make_bed(self) -> NoReturn: # Use the gzip and shutil modules to produce temporary unzipped files if self.input_extension == ".gz": - input_file = os.path.join( + temp_input_file = os.path.join( os.path.dirname(self.output_bed), os.path.splitext(self.file_name)[0], ) with gzip.open(self.input_file, "rb") as f_in: - with open(input_file, "wb") as f_out: + with open(temp_input_file, "wb") as f_out: shutil.copyfileobj(f_in, f_out) - self.pm.clean_add(input_file) + self.pm.clean_add(temp_input_file) # creating cmd for bedGraph files if self.input_type == "bedGraph": @@ -309,6 +309,7 @@ def make_bed(self) -> NoReturn: ), ] self.pm.run(cmd, target=self.output_bed) + self.pm._cleanup() def make_bigbed(self) -> NoReturn: """ @@ -337,7 +338,7 @@ def make_bigbed(self) -> NoReturn: "https://genome.ucsc.edu/goldenpath/help/bigBed.html" ) if bedtype is not None: - cmd = "zcat " + self.output_bed + " | sort -k1,1 -k2,2n > " + temp + cmd = f"zcat {self.output_bed} | sort -k1,1 -k2,2n > {temp}" self.pm.run(cmd, temp) cmd = f"{BED_TO_BIGBED_PROGRAM} -type={bedtype} {temp} {self.chrom_sizes} {big_narrow_peak}" @@ -371,6 +372,7 @@ def make_bigbed(self) -> NoReturn: f"unable to validate genome assembly with Refgenie. 
" f"Error: {err}" ) + self.pm._cleanup() def get_rgc(self) -> str: """ diff --git a/bedboss/cli.py b/bedboss/cli.py index c1c18c3..ef97d3b 100644 --- a/bedboss/cli.py +++ b/bedboss/cli.py @@ -22,6 +22,10 @@ def build_argparser() -> ArgumentParser: sub_all = subparser.add_parser( "all", help="Run all bedboss pipelines and insert data into bedbase" ) + sub_all_pep = subparser.add_parser( + "all-pep", + help="Run all bedboss pipelines using one PEP and insert data into bedbase", + ) sub_make = subparser.add_parser( "make", help="A pipeline to convert bed, bigbed, bigwig or bedgraph " @@ -136,6 +140,21 @@ def build_argparser() -> ArgumentParser: help="just commit the JSON to the database", ) + # all-pep + sub_all_pep.add_argument( + "--pep_config", + dest="pep_config", + required=True, + help="Path to the pep configuration file [Required]\n " + "Required fields in PEP are: " + "sample_name, input_file, input_type,outfolder, genome, bedbase_config.\n " + "Optional fields in PEP are: " + "rfg_config, narrowpeak, check_qc, standard_chrom, chrom_sizes, " + "open_signal_matrix, ensdb, sample_yaml, no_db_commit, just_db_commit, " + "no_db_commit, force_overwrite, skip_qdrant", + type=str, + ) + # bed_qc sub_qc.add_argument( "--bedfile", diff --git a/bedboss/utils.py b/bedboss/utils.py index ba664dd..fab4694 100644 --- a/bedboss/utils.py +++ b/bedboss/utils.py @@ -1,6 +1,5 @@ import os import logging -import requests import urllib from bbconf import BedBaseConf from typing import NoReturn @@ -43,22 +42,25 @@ def standardize_genome_name(input_genome: str) -> str: return input_genome -def download_file(url: str, path: str) -> NoReturn: +def download_file(url: str, path: str, no_fail: bool = False) -> NoReturn: """ Download file from the url to specific location :param url: URL of the file :param path: Local path with filename + :param no_fail: If True, do not raise exception if download fails :return: NoReturn """ _LOGGER.info(f"Downloading remote file: {url}") - 
_LOGGER.info(f"Local path: {path}") + _LOGGER.info(f"Local path: {os.path.abspath(path)}") try: urllib.request.urlretrieve(url, path) _LOGGER.info(f"File downloaded successfully!") except Exception as e: _LOGGER.error(f"File download failed.") - raise e + if not no_fail: + raise e + _LOGGER.error(f"File download failed. Continuing anyway...") def check_db_connection(bedbase_config: str) -> bool: diff --git a/docs/templates/usage.template b/docs/templates/usage.template new file mode 100644 index 0000000..582cef7 --- /dev/null +++ b/docs/templates/usage.template @@ -0,0 +1,16 @@ +# Usage reference + +BEDboss is command-line tool-warehouse of 3 pipelines for genomic interval files + +BEDboss include: bedmaker, bedqc, bedstat. This pipelines can be run using next positional arguments: + +- `bedbase all`: Runs all pipelines one in order: bedmaker -> bedqc -> bedstat + +- `bedbase make`: Creates Bed and BigBed files from other type of genomic interval files [bigwig|bedgraph|bed|bigbed|wig] + +- `bedbase qc`: Runs Quality control for bed file (Works only with bed files) + +- `bedbase stat`: Runs statistics for bed and bigbed files. + +Here you can see the command-line usage instructions for the main bedboss command and for each subcommand: + diff --git a/docs/usage.md b/docs/usage.md index a457e59..ede3f99 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -16,57 +16,60 @@ Here you can see the command-line usage instructions for the main bedboss comman ## `bedboss --help` ```console -version: 0.1.0 -usage: bedboss [-h] [--version] {all,make,qc,stat} ... +version: 0.1.0a3 +usage: bedboss [-h] [--version] [--silent] [--verbosity V] [--logdev] + {all,all-pep,make,qc,stat} ... Warehouse of pipelines for BED-like files: bedmaker, bedstat, and bedqc. 
positional arguments: - {all,make,qc,stat} - all Run all bedboss pipelines and insert data into bedbase - make A pipeline to convert bed, bigbed, bigwig or bedgraph - files into bed and bigbed formats - qc Run quality control on bed file (bedqc) - stat A pipeline to read a file in BED format and produce - metadata in JSON format. + {all,all-pep,make,qc,stat} + all Run all bedboss pipelines and insert data into bedbase + all-pep Run all bedboss pipelines using one PEP and insert + data into bedbase + make A pipeline to convert bed, bigbed, bigwig or bedgraph + files into bed and bigbed formats + qc Run quality control on bed file (bedqc) + stat A pipeline to read a file in BED format and produce + metadata in JSON format. options: - -h, --help show this help message and exit - --version show program's version number and exit + -h, --help show this help message and exit + --version show program's version number and exit + --silent Silence logging. Overrides verbosity. + --verbosity V Set logging level (1-5 or logging module level name) + --logdev Expand content of logging message format. 
``` ## `bedboss all --help` ```console -usage: bedboss all [-h] -s SAMPLE_NAME -f INPUT_FILE -t INPUT_TYPE -o - OUTPUT_FOLDER -g GENOME [-r RFG_CONFIG] - [--chrom-sizes CHROM_SIZES] [-n NARROWPEAK] - [--standard-chrom] [--check-qc] - [--open-signal-matrix OPEN_SIGNAL_MATRIX] [--ensdb ENSDB] - --bedbase-config BEDBASE_CONFIG [-y SAMPLE_YAML] - [--no-db-commit] [--just-db-commit] +usage: bedboss all [-h] --outfolder OUTFOLDER -s SAMPLE_NAME -f INPUT_FILE -t + INPUT_TYPE -g GENOME [-r RFG_CONFIG] + [--chrom-sizes CHROM_SIZES] [-n] [--standard-chrom] + [--check-qc] [--open-signal-matrix OPEN_SIGNAL_MATRIX] + [--ensdb ENSDB] --bedbase-config BEDBASE_CONFIG + [-y SAMPLE_YAML] [--no-db-commit] [--just-db-commit] options: -h, --help show this help message and exit + --outfolder OUTFOLDER + Pipeline output folder [Required] -s SAMPLE_NAME, --sample-name SAMPLE_NAME name of the sample used to systematically build the - output name + output name [Required] -f INPUT_FILE, --input-file INPUT_FILE - Input file + Input file [Required] -t INPUT_TYPE, --input-type INPUT_TYPE - Input type [required] options: + Input type [Required] options: (bigwig|bedgraph|bed|bigbed|wig) - -o OUTPUT_FOLDER, --output_folder OUTPUT_FOLDER - Output folder -g GENOME, --genome GENOME - reference genome (assembly) + reference genome (assembly) [Required] -r RFG_CONFIG, --rfg-config RFG_CONFIG file path to the genome config file(refgenie) --chrom-sizes CHROM_SIZES a full path to the chrom.sizes required for the bedtobigbed conversion - -n NARROWPEAK, --narrowpeak NARROWPEAK - whether the regions are narrow (transcription factor - implies narrow, histone mark implies broad peaks) + -n, --narrowpeak whether it's a narrowpeak file --standard-chrom Standardize chromosome names. Default: False --check-qc Check quality control before processing data. 
Default: True @@ -76,7 +79,7 @@ options: --ensdb ENSDB A full path to the ensdb gtf file required for genomes not in GDdata --bedbase-config BEDBASE_CONFIG - a path to the bedbase configuration file + a path to the bedbase configuration file [Required] -y SAMPLE_YAML, --sample-yaml SAMPLE_YAML a yaml config file with sample attributes to pass on more metadata into the database @@ -84,37 +87,54 @@ options: --just-db-commit just commit the JSON to the database ``` +## `bedboss all-pep --help` +```console +usage: bedboss all-pep [-h] --pep_config PEP_CONFIG + +options: + -h, --help show this help message and exit + --pep_config PEP_CONFIG + Path to the pep configuration file [Required] Required + fields in PEP are: sample_name, input_file, + input_type,outfolder, genome, bedbase_config. Optional + fields in PEP are: rfg_config, narrowpeak, check_qc, + standard_chrom, chrom_sizes, open_signal_matrix, + ensdb, sample_yaml, no_db_commit, just_db_commit, + no_db_commit, force_overwrite, skip_qdrant +``` + ## `bedboss make --help` ```console -usage: bedboss make [-h] -f INPUT_FILE [-n NARROWPEAK] -t INPUT_TYPE -g GENOME - -r RFG_CONFIG -o OUTPUT_BED --output-bigbed OUTPUT_BIGBED - -s SAMPLE_NAME [--chrom-sizes CHROM_SIZES] - [--standard-chrom] +usage: bedboss make [-h] -f INPUT_FILE --outfolder OUTFOLDER [-n] -t + INPUT_TYPE -g GENOME [-r RFG_CONFIG] -o OUTPUT_BED + --output-bigbed OUTPUT_BIGBED -s SAMPLE_NAME + [--chrom-sizes CHROM_SIZES] [--standard-chrom] options: -h, --help show this help message and exit -f INPUT_FILE, --input-file INPUT_FILE - path to the input file - -n NARROWPEAK, --narrowpeak NARROWPEAK - whether the regions are narrow (transcription factor - implies narrow, histone mark implies broad peaks) + path to the input file [Required] + --outfolder OUTFOLDER + Pipeline output folder [Required] + -n, --narrowpeak whether it's a narrowpeak file -t INPUT_TYPE, --input-type INPUT_TYPE - a bigwig or a bedgraph file that will be converted - into BED format + 
input file format (supported formats: bedGraph, + bigBed, bigWig, wig) [Required] -g GENOME, --genome GENOME - reference genome + reference genome [Required] -r RFG_CONFIG, --rfg-config RFG_CONFIG file path to the genome config file -o OUTPUT_BED, --output-bed OUTPUT_BED - path to the output BED files + path to the output BED files [Required] --output-bigbed OUTPUT_BIGBED - path to the folder of output bigBed files + path to the folder of output bigBed files [Required] -s SAMPLE_NAME, --sample-name SAMPLE_NAME name of the sample used to systematically build the - output name + output name [Required] --chrom-sizes CHROM_SIZES - a full path to the chrom.sizes required for the - bedtobigbed conversion + whether to standardize chromosome names. If true, + bedmaker will remove the regions on ChrUn chromosomes, + such as chrN_random and chrUn_random. [Default: False] --standard-chrom Standardize chromosome names. Default: False ``` @@ -124,22 +144,24 @@ usage: bedboss qc [-h] --bedfile BEDFILE --outfolder OUTFOLDER options: -h, --help show this help message and exit - --bedfile BEDFILE a full path to bed file to process + --bedfile BEDFILE a full path to bed file to process [Required] --outfolder OUTFOLDER - a full path to output log folder. + a full path to output log folder. 
[Required] ``` ## `bedboss stat --help` ```console -usage: bedboss stat [-h] --bedfile BEDFILE +usage: bedboss stat [-h] --bedfile BEDFILE --outfolder OUTFOLDER [--open-signal-matrix OPEN_SIGNAL_MATRIX] [--ensdb ENSDB] - [--bigbed BIGBED] [--bedbase-config BEDBASE_CONFIG] - [-y SAMPLE_YAML] --genome GENOME_ASSEMBLY [--no-db-commit] + [--bigbed BIGBED] --bedbase-config BEDBASE_CONFIG + [-y SAMPLE_YAML] --genome GENOME [--no-db-commit] [--just-db-commit] options: -h, --help show this help message and exit - --bedfile BEDFILE a full path to bed file to process + --bedfile BEDFILE a full path to bed file to process [Required] + --outfolder OUTFOLDER + Pipeline output folder [Required] --open-signal-matrix OPEN_SIGNAL_MATRIX a full path to the openSignalMatrix required for the tissue specificity plots @@ -147,14 +169,12 @@ options: not in GDdata --bigbed BIGBED a full path to the bigbed files --bedbase-config BEDBASE_CONFIG - a path to the bedbase configuration file + a path to the bedbase configuration file [Required] -y SAMPLE_YAML, --sample-yaml SAMPLE_YAML a yaml config file with sample attributes to pass on more metadata into the database - --genome GENOME_ASSEMBLY - genome assembly of the sample + --genome GENOME genome assembly of the sample [Required] --no-db-commit whether the JSON commit to the database should be skipped --just-db-commit whether just to commit the JSON to the database ``` - diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 9d5ad48..31c63db 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,5 +1,6 @@ logmuse>=0.2.7 -peppy>=0.35.7 +coloredlogs>=15.0.1 +peppy>=0.40.0a4 yacman>=0.8.4 requests>=2.28.2 piper>=0.13.2 diff --git a/update_usage_docs.sh b/update_usage_docs.sh index 5ecc7de..9faaa3a 100755 --- a/update_usage_docs.sh +++ b/update_usage_docs.sh @@ -2,7 +2,7 @@ cp docs/templates/usage.template usage.template # bedboss --help > USAGE.temp 2>&1 -for cmd in 
"--help" "all --help" "make --help" "qc --help" "stat --help"; do +for cmd in "--help" "all --help" "all-pep --help" "make --help" "qc --help" "stat --help"; do echo $cmd echo -e "## \`bedboss $cmd\`" > USAGE_header.temp bedboss $cmd --help > USAGE.temp 2>&1