Merge pull request #2 from databio/dev

v0.1.0a1
databio · Aug 2, 2023 · 81d22fe · 81d22fe
2 parents a367762 + c6a6993
commit 81d22fe
Show file tree

Hide file tree

Showing 59 changed files with 6,168 additions and 1 deletion.
diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
@@ -0,0 +1,11 @@
+name: Lint
+
+on: [pull_request]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+      - uses: psf/black@stable
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -0,0 +1,31 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel twine
+    - name: Build and publish
+      env:
+        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+      run: |
+        python setup.py sdist bdist_wheel
+        twine upload dist/*
diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml
@@ -0,0 +1,36 @@
+## This workflow is commented out because the tests need a lot of dependencies and can only be run locally!
+# name: Run pytests
+#
+#on:
+#  push:
+#    branches: [dev]
+#  pull_request:
+#    branches: [master, dev]
+#
+#jobs:
+#  pytest:
+#    runs-on: ${{ matrix.os }}
+#    strategy:
+#      matrix:
+#        python-version: ["3.8", "3.9", "3.10"]
+#        os: [ubuntu-latest]
+#
+#    steps:
+#    - uses: actions/checkout@v2
+#
+#    - name: Set up Python ${{ matrix.python-version }}
+#      uses: actions/setup-python@v2
+#      with:
+#        python-version: ${{ matrix.python-version }}
+#
+#    - name: Install dev dependencies
+#      run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi
+#
+#    - name: Install test dependencies
+#      run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi
+#
+#    - name: Install package
+#      run: python -m pip install .
+#
+#    - name: Run pytest tests
+#      run: pytest tests -x -vv
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,8 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+.idea
+
+
+bedqc/*
+test/bedqc/*
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,9 @@
+Copyright 2023 Nathan Sheffield
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,7 @@
+include requirements/*
+include README.md
+include bedboss/*
+include bedboss/bedstat/*
+include bedboss/bedstat/tools/*
+include bedboss/bedmaker/*
+include bedboss/bedqc/*
diff --git a/README.md b/README.md
@@ -1,2 +1,31 @@
 # bedboss
-Python package with bed file processing tools and pipelines for bedbase
+
+---
+![Run pytests](https://github.com/bedbase/bedboss/workflows/Run%20pytests/badge.svg)
+[![docs-badge](https://readthedocs.org/projects/bedboss/badge/?version=latest)](https://bedboss.databio.org/en/latest/)
+[![pypi-badge](https://img.shields.io/pypi/v/bedboss)](https://pypi.org/project/bedboss)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+
+bedboss is a command-line pipeline that standardizes and calculates statistics for genomic interval data, and enters the results into a BEDbase database. It has 3 components: 1) bedmaker (`bedboss make`); 2) bedqc (`bedboss qc`); and 3) bedstat (`bedboss stat`). You may run the 3 pipelines separately or all together (`bedboss all`).
+
+## 1) bedmaker
+
+Converts supported file types into BED and bigBed format. Currently supported formats:
+   - bedGraph
+   - bigBed
+   - bigWig
+   - wig
+
+## 2) bedqc
+
+Assess QC of BED files and flag potential problems for further evaluation so you can determine whether they should be included in downstream analysis. 
+Currently, it flags BED files that are larger than 2 GB, have over 5 million regions, or have mean region width less than 10 bp.
+These thresholds can be changed with pipeline arguments.
+
+## 3) bedstat
+
+Calculates statistics about BED files.
+
+# Documentation
+
+Detailed information about each pipeline can be found in the [bedboss Readme](./docs/README.md).
diff --git a/bedboss/__init__.py b/bedboss/__init__.py
@@ -0,0 +1,28 @@
+""" Package-level data """
+from bedboss import *
+from bedboss.bedqc import bedqc
+from bedboss.bedmaker import bedmaker
+from bedboss.bedstat import bedstat
+import logmuse
+
+# Package metadata.
+__version__ = "0.1.0a1"
+__package_name__ = "bedboss"
+__author__ = [
+    "Oleksandr Khoroshevskyi",
+    "Michal Stolarczyk",
+    "Ognen Duzlevski",
+    "Jose Verdezoto",
+    "Bingjie Xue",
+]
+__email__ = "[email protected]"
+
+# Names exported by `from bedboss import *`.
+__all__ = [
+    "__version__",
+    "__package_name__",
+    "__author__",
+    "bedqc",
+    "bedmaker",
+    "bedstat",
+]
+
+# Logger setup runs as a side effect of importing the package.
+# NOTE(review): the version string is passed as the first argument here, while
+# bedboss/__main__.py passes "bedboss" to the same function -- presumably the
+# package name was intended; confirm against the logmuse API.
+logmuse.init_logger(__version__)
diff --git a/bedboss/__main__.py b/bedboss/__main__.py
@@ -0,0 +1,15 @@
+import sys
+import logmuse
+
+from bedboss.bedboss import main
+
+
+# Initialise the "bedboss" logger at import time, before main() is invoked.
+_LOGGER = logmuse.init_logger("bedboss")
+
+
+# Script entry point: run the CLI and turn Ctrl-C into a clean, non-zero exit.
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        # Report the interruption instead of showing a traceback.
+        print("Pipeline aborted.")
+        sys.exit(1)
diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py
@@ -0,0 +1,190 @@
+import logging
+import os
+import urllib.request
+from typing import NoReturn, Union, Dict
+import pypiper
+from argparse import Namespace
+import logmuse
+
+from bedboss.bedstat.bedstat import bedstat
+from bedboss.bedmaker.bedmaker import BedMaker
+from bedboss.bedqc.bedqc import bedqc
+from bedboss.cli import build_argparser
+
+from .const import (
+    OS_HG19,
+    OS_HG38,
+    OS_MM10,
+    OPEN_SIGNAL_FOLDER,
+    OPEN_SIGNAL_URL,
+    BED_FOLDER_NAME,
+    BIGBED_FOLDER_NAME,
+)
+from .utils import extract_file_name, standardize_genome_name, download_file
+from .exceptions import OpenSignalMatrixException
+from bedboss import __version__
+
+_LOGGER = logging.getLogger("bedboss")
+
+
+def get_osm_path(genome: str) -> Union[str, None]:
+    """
+    By providing genome name download Open Signal Matrix
+    :param genome: genome assembly
+    :return: path to the Open Signal Matrix
+    """
+    # TODO: add more osm
+    _LOGGER.info(f"Getting Open Signal Matrix file path...")
+    if genome == "hg19":
+        osm_name = OS_HG19
+    elif genome == "hg38":
+        osm_name = OS_HG38
+    elif genome == "mm10":
+        osm_name = OS_MM10
+    else:
+        raise OpenSignalMatrixException(
+            "For this genome open Signal Matrix was not found. Exiting..."
+        )
+        # return None
+    osm_path = os.path.join(OPEN_SIGNAL_FOLDER, osm_name)
+    if not os.path.exists(osm_path):
+        if not os.path.exists(OPEN_SIGNAL_FOLDER):
+            os.makedirs(OPEN_SIGNAL_FOLDER)
+        download_file(url=f"{OPEN_SIGNAL_URL}{osm_name}", path=osm_path)
+    return osm_path
+
+
+def run_all(
+    sample_name: str,
+    input_file: str,
+    input_type: str,
+    outfolder: str,
+    genome: str,
+    bedbase_config: str,
+    rfg_config: str = None,
+    narrowpeak: bool = False,
+    check_qc: bool = True,
+    standard_chrom: bool = False,
+    chrom_sizes: str = None,
+    open_signal_matrix: str = None,
+    ensdb: str = None,
+    sample_yaml: str = None,
+    just_db_commit: bool = False,
+    no_db_commit: bool = False,
+    force_overwrite: bool = False,
+    pm: pypiper.PipelineManager = None,
+    **kwargs,
+) -> NoReturn:
+    """
+    Run bedboss: bedmaker, bedqc and bedstat.
+    :param sample_name: Sample name [required]
+    :param input_file: Input file [required]
+    :param input_type: Input type [required] options: (bigwig|bedgraph|bed|bigbed|wig)
+    :param outfolder: Folder, where output should be saved  [required]
+    :param genome: genome_assembly of the sample. [required] options: (hg19, hg38) #TODO: add more
+    :param bedbase_config: a path to the bedbase configuration file. [required] #TODO: add example
+    :param open_signal_matrix: a full path to the openSignalMatrix required for the tissue [optional]
+    :param rfg_config: file path to the genome config file [optional]
+    :param narrowpeak: whether the regions are narrow
+        (transcription factor implies narrow, histone mark implies broad peaks) [optional]
+    :param check_qc: set True to run quality control during badmaking [optional] (default: True)
+    :param standard_chrom: Standardize chromosome names. [optional] (Default: False)
+    :param chrom_sizes: a full path to the chrom.sizes required for the bedtobigbed conversion [optional]
+    :param sample_yaml: a yaml config file with sample attributes to pass on MORE METADATA into the database [optional]
+    :param ensdb: a full path to the ensdb gtf file required for genomes not in GDdata [optional]
+        (basically genomes that's not in GDdata)
+    :param just_db_commit: whether just to commit the JSON to the database (default: False)
+    :param force_overwrite: force overwrite analysis
+    :param no_db_commit: whether the JSON commit to the database should be skipped (default: False)
+    :param pm: pypiper object
+    :return: NoReturn
+    """
+    _LOGGER.warning(f"Unused arguments: {kwargs}")
+    file_name = extract_file_name(input_file)
+    genome = standardize_genome_name(genome)
+
+    # find/download open signal matrix
+    if not open_signal_matrix or not os.path.exists(open_signal_matrix):
+        open_signal_matrix = get_osm_path(genome)
+
+    if not sample_yaml:
+        sample_yaml = f"{sample_name}.yaml"
+
+    output_bed = os.path.join(outfolder, BED_FOLDER_NAME, f"{file_name}.bed.gz")
+    output_bigbed = os.path.join(outfolder, BIGBED_FOLDER_NAME)
+
+    _LOGGER.info(f"output_bed = {output_bed}")
+    _LOGGER.info(f"output_bigbed = {output_bigbed}")
+
+    # set env for bedstat:
+    output_folder_bedstat = os.path.join(outfolder, "output")
+    os.environ["BEDBOSS_OUTPUT_PATH"] = output_folder_bedstat
+
+    BedMaker(
+        input_file=input_file,
+        input_type=input_type,
+        output_bed=output_bed,
+        output_bigbed=output_bigbed,
+        sample_name=sample_name,
+        genome=genome,
+        rfg_config=rfg_config,
+        narrowpeak=narrowpeak,
+        check_qc=check_qc,
+        standard_chrom=standard_chrom,
+        chrom_sizes=chrom_sizes,
+        pm=pm,
+    )
+
+    bedstat(
+        bedfile=output_bed,
+        outfolder=outfolder,
+        bedbase_config=bedbase_config,
+        genome=genome,
+        ensdb=ensdb,
+        open_signal_matrix=open_signal_matrix,
+        bigbed=output_bigbed,
+        sample_yaml=sample_yaml,
+        just_db_commit=just_db_commit,
+        no_db_commit=no_db_commit,
+        force_overwrite=force_overwrite,
+        pm=pm,
+    )
+
+
+def main(test_args: dict = None) -> NoReturn:
+    """
+    Run pipeline that was specified in as positional argument.
+    :param str test_args: one of the bedboss pipelines
+    """
+    parser = build_argparser()
+    if test_args:
+        args = Namespace(**test_args)
+    else:
+        args, _ = parser.parse_known_args()
+        global _LOGGER
+        _LOGGER = logmuse.logger_via_cli(args, make_root=True)
+
+    args_dict = vars(args)
+
+    pm = pypiper.PipelineManager(
+        name="bedboss-pipeline",
+        outfolder=args_dict.get("outfolder")
+        if args_dict.get("outfolder")
+        else "test_outfolder",
+        recover=True,
+        multi=True,
+        version=__version__,
+    )
+
+    if args_dict["command"] == "all":
+        run_all(pm=pm, **args_dict)
+    elif args_dict["command"] == "make":
+        BedMaker(pm=pm, **args_dict)
+    elif args_dict["command"] == "qc":
+        bedqc(pm=pm, **args_dict)
+    elif args_dict["command"] == "stat":
+        bedstat(pm=pm, **args_dict)
+    else:
+        parser.print_help()
+        # raise Exception("Incorrect pipeline name.")
+    pm.stop_pipeline()
diff --git a/bedboss/bedmaker/__init__.py b/bedboss/bedmaker/__init__.py