hydra-genetics · padraicc · Dec 4, 2024 · Dec 3, 2024 · Dec 3, 2024 · Dec 3, 2024
@@ -12,41 +12,40 @@ on:
   workflow_dispatch:
 
 jobs:
-  integration-small-singularity:
-    name: integration small data set singularity
-    runs-on: ubuntu-latest
-    steps:
+  isteps:
       - uses: actions/checkout@v3
       - name: Set up Python 3.8
         uses: actions/setup-python@v3
         with:
           python-version: 3.8
-      - name: Setup Miniforge
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-            miniforge-variant: Miniforge3
-            miniforge-version: latest
-            activate-environment: my-env
-            use-mamba: true
-      - name: Set strict channel
+      - name: Show free disk space (before cleanup)
+        run: |
+          echo "Free space:"
+          df -h
+      # Delete preinstalled toolchains and cached docker images that this
+      # workflow does not use; presumably to leave room for the container
+      # images pulled by the integration tests.
+      - name: Free up disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo docker image prune --all --force
+      - name: Show free disk space (after cleanup)
         run: |
-          conda config --set channel_priority strict
+          echo "Free space:"
+          df -h
+      - name: Install apptainer/singularity
+        run: |
+          # software-properties-common provides add-apt-repository, so it has
+          # to be installed before the PPA can be registered.
+          sudo apt-get update
+          sudo apt-get install -y software-properties-common
+          sudo add-apt-repository -y ppa:apptainer/ppa
+          # Refresh the package index so the newly added PPA is visible.
+          sudo apt-get update
+          sudo apt-get install -y apptainer
       - name: Install requirements.txt
         run: |
           pip install -r requirements.txt
       - name: Install requirements.test.txt
         run: |
           pip install -r requirements.test.txt
-      - name: Add conda to system path
-        run: |
-          echo $CONDA/bin >> $GITHUB_PATH
       - name: Integration test - small dataset
         working-directory: .tests/integration
         run: |
-          mamba install -c conda-forge -c bioconda apptainer=1.3.2
           snakemake -s ../../workflow/Snakefile -j 1 --show-failed-logs  --configfile config.yaml --use-singularity --singularity-args  " --cleanenv --bind /home/runner "
       - name: Integration test - small dataset PACBIO
         working-directory: .tests/integration
         run: |
-          mamba install -c conda-forge -c bioconda apptainer=1.3.2
           snakemake -s ../../workflow/Snakefile -j 1 --show-failed-logs  --configfile config_pacbio.yaml --use-singularity --singularity-args  " --cleanenv --bind /home/runner "
@@ -9,6 +9,13 @@ aligner: "pbmm2"
 reference:
   fasta: "reference/human_GRCh38.fasta"
 
+# Settings read by the paraphase rule.
+paraphase:
+  container: "docker://quay.io/pacbio/paraphase:3.1.1_build1"
+  # Value handed to paraphase's --genome option.
+  genome: "38"
+  # Genes to analyse: joined with commas for --gene; the rule declares one
+  # VCF output per gene listed here.
+  genes:
+    - pms2
+
+
 trgt_genotype:
   container: "docker://hydragenetics/trgt:1.0.0"
   bed: "reference/human_GRCh38_trgt.bed"

@@ -657,6 +657,29 @@ Manta calls structural variants (SVs) and indels from mapped paired-end sequenci
 
 ---
 
+## [paraphase](https://github.com/PacificBiosciences/paraphase)
+Paraphase is a Python tool that takes HiFi aligned BAMs as input (whole-genome or enrichment), phases haplotypes for genes of the same family, determines copy numbers and makes phased variant calls.
+
+### :snake: Rule
+
+#SNAKEMAKE_RULE_SOURCE__paraphase__paraphase#
+
+#### :left_right_arrow: input / output files
+
+#SNAKEMAKE_RULE_TABLE__paraphase__paraphase#
+
+### :wrench: Configuration
+
+#### Software settings (`config.yaml`)
+
+#CONFIGSCHEMA__paraphase#
+
+#### Resources settings (`resources.yaml`)
+
+#RESOURCESSCHEMA__paraphase#
+
+---
+
 ## Pindel
 
 ### [pindel_generate_config](https://github.com/genome/pindel)

@@ -9,7 +9,6 @@ include: "rules/sawfish.smk"
 include: "rules/tabix.smk"
 include: "rules/bgzip.smk"
 include: "rules/sniffles2.smk"
-include: "rules/trgt.smk"
 include: "rules/automap.smk"
 include: "rules/cnvkit.smk"
 include: "rules/cnvpytor.smk"
@@ -18,6 +17,7 @@ include: "rules/expansionhunter.smk"
 include: "rules/gatk.smk"
 include: "rules/jumble.smk"
 include: "rules/manta.smk"
+include: "rules/paraphase.smk"
 include: "rules/pindel.smk"
 include: "rules/purecn.smk"
 include: "rules/reviewer.smk"

@@ -291,6 +291,35 @@ def compile_output_list(wildcards):
         for suffix in files[prefix]
     ]
 
+    files = {
+        "cnv_sv/paraphase": ["bam"],
+        "cnv_sv/paraphase": [".bam.bai"],
+        "cnv_sv/paraphase": ["json"],
+    }
+    output_files += [
+        f"{prefix}/paraphase_{sample}_{unit_type}/{sample}_{unit_type}.paraphase.{suffix}"
+        for prefix in files.keys()
+        for sample in get_samples(samples)
+        for unit_type in get_unit_types(units, sample)
+        for platform in units.loc[(sample,)].platform
+        if platform in ["ONT", "PACBIO"]
+        for suffix in files[prefix]
+    ]
+
+    # Per-gene paraphase VCFs: one path per sample / unit type / gene, where the
+    # genes come from `paraphase.genes` in the config (no genes -> no paths).
+    # NOTE(review): the paraphase rule declares uncompressed `.vcf` outputs, while
+    # `.vcf.gz` is requested here; presumably a separate compression rule
+    # produces the `.gz` files -- confirm.
+    files = {
+        "cnv_sv/paraphase": ["vcf.gz"],
+    }
+    output_files += [
+        f"{prefix}/paraphase_{sample}_{unit_type}/{sample}_{unit_type}_paraphase_vcfs/{sample}_{unit_type}_{gene}.{suffix}"
+        for prefix in files.keys()
+        for sample in get_samples(samples)
+        for unit_type in get_unit_types(units, sample)
+        for platform in units.loc[(sample,)].platform
+        if platform in ["ONT", "PACBIO"]
+        for gene in config.get("paraphase", {}).get("genes", "")
+        for suffix in files[prefix]
+    ]
+
     files = {
         "cnv_sv/cnvkit_call": ["pathology.loh.cns"],
         "cnv_sv/cnvkit_diagram": ["pdf"],

@@ -0,0 +1,49 @@
+__author__ = "Padraic Corcoran"
+__copyright__ = "Copyright 2024, Padraic Corcoran"
+__email__ = "[email protected]"
+__license__ = "GPL-3"
+
+
+# Run paraphase on a long-read BAM and collect the haplotagged BAM, its index,
+# the JSON summary and one VCF per configured gene.
+rule paraphase:
+    input:
+        bam=lambda wildcards: get_longread_bam(wildcards)[0],
+        bai=lambda wildcards: get_longread_bam(wildcards)[1],
+        ref=config.get("reference", {}).get("fasta", ""),
+    output:
+        bam="cnv_sv/paraphase/paraphase_{sample}_{type}/{sample}_{type}.paraphase.bam",
+        bai="cnv_sv/paraphase/paraphase_{sample}_{type}/{sample}_{type}.paraphase.bam.bai",
+        json="cnv_sv/paraphase/paraphase_{sample}_{type}/{sample}_{type}.paraphase.json",
+        # One VCF per configured gene. The default is an empty list so that no
+        # VCF with an empty gene name is declared when `genes` is not configured.
+        vcf=expand(
+            "cnv_sv/paraphase/paraphase_{{sample}}_{{type}}/{{sample}}_{{type}}_paraphase_vcfs/{{sample}}_{{type}}_{gene}.vcf",
+            gene=config.get("paraphase", {}).get("genes", []),
+        ),
+    params:
+        extra=config.get("paraphase", {}).get("extra", ""),
+        genome=config.get("paraphase", {}).get("genome", "38"),
+        prefix=lambda wildcards, output: "{}_{}".format(wildcards.sample, wildcards.type),
+        out=lambda wildcards, output: os.path.dirname(output.bam),
+        # Only emit --gene when genes are configured; "--gene" without a value
+        # is not a valid command line.
+        gene=lambda wildcards: (
+            "--gene " + ",".join(config.get("paraphase", {}).get("genes", []))
+            if config.get("paraphase", {}).get("genes")
+            else ""
+        ),
+    log:
+        "cnv_sv/paraphase/{sample}_{type}.output.log",
+    benchmark:
+        repeat("cnv_sv/paraphase/{sample}_{type}.output.benchmark.tsv", config.get("paraphase", {}).get("benchmark_repeats", 1))
+    threads: config.get("paraphase", {}).get("threads", config["default_resources"]["threads"])
+    resources:
+        mem_mb=config.get("paraphase", {}).get("mem_mb", config["default_resources"]["mem_mb"]),
+        mem_per_cpu=config.get("paraphase", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]),
+        partition=config.get("paraphase", {}).get("partition", config["default_resources"]["partition"]),
+        threads=config.get("paraphase", {}).get("threads", config["default_resources"]["threads"]),
+        time=config.get("paraphase", {}).get("time", config["default_resources"]["time"]),
+    container:
+        config.get("paraphase", {}).get("container", config["default_container"])
+    message:
+        "{rule}: run paraphase on  {input.bam}"
+    shell:
+        "paraphase "
+        "--bam {input.bam} "
+        "--reference {input.ref} "
+        "--prefix {params.prefix} "
+        "--out {params.out} "
+        "{params.gene} "
+        "--threads {threads} "
+        "--genome {params.genome} "
+        "{params.extra} &> {log}"
@@ -481,6 +481,20 @@ properties:
       container:
         type: string
         description: name or path to docker/singularity container
+
+  # Software settings for the paraphase rule; every key the rule reads from
+  # the `paraphase` config section is listed here.
+  paraphase:
+    type: object
+    description: parameters for paraphase
+    properties:
+      benchmark_repeats:
+        type: integer
+        description: set number of times benchmark should be repeated
+      container:
+        type: string
+        description: name or path to docker/singularity container
+      extra:
+        type: string
+        description: parameters that should be forwarded
+      genes:
+        type: array
+        items:
+          type: string
+        description: genes to analyse, passed to '--gene' and used to name the per-gene vcf outputs
+      genome:
+        type: string
+        description: genome version passed to '--genome', the rule uses "38" when not set
 
   pindel_generate_config:
     description: parameters for pindel_generate_config

@@ -586,6 +586,26 @@ properties:
       time:
         type: string
         description: max execution time
+
+  # Resource overrides for the paraphase rule; the rule falls back to the
+  # matching `default_resources` value for any key not set here.
+  paraphase:
+    type: object
+    description: resource definitions for paraphase
+    properties:
+      mem_mb:
+        type: integer
+        description: max memory in MB to be available
+      mem_per_cpu:
+        type: integer
+        description: memory in MB used per cpu
+      partition:
+        type: string
+        description: partition to use on cluster
+      threads:
+        type: integer
+        description: number of threads to be available
+      time:
+        type: string
+        description: max execution time
 
   pindel_generate_config:
     description: resource definitions for pindel_generate_config

@@ -776,6 +776,42 @@ properties:
           wrk_dir:
             type: string
             description: working directory
+
+  # Inputs and outputs of the paraphase rule, keyed by the names used in the
+  # rule's `input:` and `output:` sections.
+  paraphase:
+    type: object
+    description: input and output parameters for paraphase
+    properties:
+      input:
+        type: object
+        description: list of inputs
+        properties:
+          bam:
+            type: string
+            description: a 'bam' file
+          bai:
+            type: string
+            description: a 'bam' file index file
+          ref:
+            type: string
+            description: fasta genome reference file
+
+      output:
+        type: object
+        description: list of outputs
+        properties:
+          bam:
+            type: string
+            description: a haplotagged paraphase 'bam' file
+          bai:
+            type: string
+            description: a haplotagged paraphase 'bam' file index file
+          json:
+            type: string
+            description: a paraphase 'json' file
+          vcf:
+            type: string
+            description: a paraphase 'vcf' file with variant and haplotype information for a single gene
+
 
   pindel_generate_config:
     type: object