Merge pull request #20 from biodiversitydata-se/devel
Devel
johnne authored Dec 22, 2022
2 parents 8591508 + 67fd3ee commit aad956d
Showing 9 changed files with 439 additions and 172 deletions.
6 changes: 3 additions & 3 deletions environment.yml
@@ -1,15 +1,15 @@
 name: coidb
 channels:
-  - bioconda
   - conda-forge
+  - bioconda
   - defaults
 dependencies:
-  - python>=3.7
+  - python
   - biopython
   - vsearch
   - tqdm
   - pandas
-  - snakemake
+  - snakemake>=7.8
   - seqkit
   - importlib_resources
   - unzip
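
The channel reorder gives conda-forge priority over bioconda (the order bioconda itself recommends), the unpinned python entry lets the solver pick a compatible interpreter, and snakemake>=7.8 matches the rerun_triggers keyword used in src/coidb/__main__.py below, which first appeared in Snakemake 7.8. A minimal setup sketch, assuming conda (or mamba) is on PATH and that the package installs a coidb console entry point:

    # Create and activate the pinned environment, then install coidb itself.
    conda env create -f environment.yml   # the env is named "coidb" by the file
    conda activate coidb
    python -m pip install .               # puts coidb and the bundled scripts on PATH
    coidb -h                              # sanity-check the CLI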
3 changes: 2 additions & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = coidb
-version = 0.4.7
+version = 0.4.8
 author = John Sundh
 author_email = [email protected]
 description = Workflow for downloading and formatting COI database
@@ -22,6 +22,7 @@ python_requires = >=3.7
 include_package_data = True
 scripts =
     src/coidb/scripts/cluster_bold.py
+    src/coidb/scripts/format_sintax.py
 install_requires =
     snakemake
     biopython
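
Listing format_sintax.py under scripts = means setuptools copies it onto PATH at install time, which is what lets the Snakefile below invoke it by bare name in a shell directive. A hedged sanity check after installation:

    # Both helper scripts should now resolve on PATH:
    command -v cluster_bold.py format_sintax.py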
132 changes: 86 additions & 46 deletions src/coidb/Snakefile
@@ -1,56 +1,74 @@
 from snakemake.utils import validate


 # this container defines the underlying OS for each job when using the workflow
 # with --use-conda --use-singularity
 singularity: "docker://continuumio/miniconda3:4.9.2"


 # Validate config
 validate(config, "config.schema.yaml")

 nrows = None
 if config["testing"]["nrows"] > 0:
     nrows = config["testing"]["nrows"]

-localrules: coidb, download, filter, clean, format
+localrules:
+    coidb,
+    download,
+    filter,
+    clean,
+    format_sintax,
+    format_dada2,


 wildcard_constraints:
-    textfile = "occurrences.txt|dna.txt|Taxon.tsv",
-    zipfile = "bold.zip|backbone.zip"
+    textfile="occurrences.txt|dna.txt|Taxon.tsv",
+    zipfile="bold.zip|backbone.zip",


-textfile_dict = {'Taxon.tsv': 'backbone.zip',
-                 'occurrences.txt': 'bold.zip',
-                 'dna.txt': 'bold.zip'}
+textfile_dict = {
+    "Taxon.tsv": "backbone.zip",
+    "occurrences.txt": "bold.zip",
+    "dna.txt": "bold.zip",
+}


 rule coidb:
-    input: expand("bold_clustered.{w}.fasta", w=["assignTaxonomy", "addSpecies"])
+    input:
+        expand("bold_clustered.{w}.fasta", w=["assignTaxonomy", "addSpecies", "sintax"]),


 rule download_zipfile:
     """
     Download zipfile with database sequences + info
     """
     output:
-        "{zipfile}"
+        "{zipfile}",
     log:
-        "logs/download.{zipfile}.log"
+        "logs/download.{zipfile}.log",
     params:
-        url = lambda wildcards: config["database"][wildcards.zipfile]
+        url=lambda wildcards: config["database"][wildcards.zipfile],
     shell:
         """
         curl -L -o $TMPDIR/{wildcards.zipfile} {params.url} > {log} 2>&1
         mv $TMPDIR/{wildcards.zipfile} {output[0]}
         """


 rule download:
     input:
-        textfile_dict.values()
+        textfile_dict.values(),


 rule extract_zipfile:
     input:
-        lambda wildcards: textfile_dict[wildcards.textfile]
+        lambda wildcards: textfile_dict[wildcards.textfile],
     output:
-        "{textfile}"
+        "{textfile}",
     log:
-        "logs/extract.{textfile}.log"
+        "logs/extract.{textfile}.log",
     shell:
         """
         f=$(unzip -l {input[0]} | grep -w {output[0]} | rev | cut -f1 -d ' ' | rev)
@@ -61,7 +79,8 @@ rule extract_zipfile:

 rule extract:
     input:
-        textfile_dict.keys()
+        textfile_dict.keys(),


 rule filter_data:
     """
@@ -72,33 +91,34 @@ rule filter_data:
     input:
         "occurrences.txt",
         "dna.txt",
-        "Taxon.tsv"
+        "Taxon.tsv",
     output:
-        info = "bold_info_filtered.tsv",
-        fasta = "bold.fasta",
+        info="bold_info_filtered.tsv",
+        fasta="bold.fasta",
     log:
-        "bold_info_non-unique-taxa.txt"
+        "bold_info_non-unique-taxa.txt",
     params:
-        genes = config["database"]["gene"],
-        filter_taxa = config["database"]["taxa"],
-        filter_rank = config["database"]["rank"],
-        ranks = config["database"]["ranks"],
-        tmpf = "$TMPDIR/bold_filtered.fasta",
-        nrows = nrows
+        genes=config["database"]["gene"],
+        filter_taxa=config["database"]["taxa"],
+        filter_rank=config["database"]["rank"],
+        ranks=config["database"]["ranks"],
+        tmpf="$TMPDIR/bold_filtered.fasta",
+        nrows=nrows,
     script:
         "scripts/common.py"


 rule remove_non_standard:
     input:
-        "bold.fasta"
+        "bold.fasta",
     output:
-        "bold_filtered.fasta"
+        "bold_filtered.fasta",
     log:
-        "logs/remove_non_standard.log"
+        "logs/remove_non_standard.log",
     params:
-        tmpfile = "$TMPDIR/bold_seqkit_cleaned.fasta",
-        ids = "$TMPDIR/bold_non_standard_ids.txt",
-        fastafile = "$TMPDIR/bold_filtered.fasta"
+        tmpfile="$TMPDIR/bold_seqkit_cleaned.fasta",
+        ids="$TMPDIR/bold_non_standard_ids.txt",
+        fastafile="$TMPDIR/bold_filtered.fasta",
     shell:
         """
         exec &> {log}
@@ -111,55 +131,75 @@ rule remove_non_standard:
         seqkit stats {input[0]} {params.tmpfile} {output[0]}
         """


 rule filter:
     input:
         "bold_info_filtered.tsv",
-        "bold_filtered.fasta"
+        "bold_filtered.fasta",


 rule cluster:
     """
     Cluster the filtered fasta file using vsearch
     """
     input:
-        fasta = "bold_filtered.fasta"
+        fasta="bold_filtered.fasta",
     output:
-        fasta = "bold_clustered.fasta"
+        fasta="bold_clustered.fasta",
     log:
-        "logs/bold/cluster.log"
+        "logs/bold/cluster.log",
     threads: 20
     resources:
-        runtime = lambda wildcards, attempt: attempt ** 2 * 60 * 10
+        runtime=lambda wildcards, attempt: attempt**2 * 60 * 10,
     params:
-        pid = config["database"]["pid"]
+        pid=config["database"]["pid"],
     shell:
         """
         cluster_bold.py --threads {threads} --pid {params.pid} \
             {input.fasta} {output.fasta} > {log} 2>&1
         """


 rule clean:
     """
     Cleans headers of sequences in clustered fasta file
     """
     input:
-        fasta = "bold_clustered.fasta"
+        fasta="bold_clustered.fasta",
     output:
-        fasta = "bold_clustered_cleaned.fasta"
+        fasta="bold_clustered_cleaned.fasta",
     script:
         "scripts/common.py"


-rule format:
+rule format_dada2:
     """
     Formats the clustered fasta file into DADA2-ready files
     """
     input:
-        fasta = "bold_clustered_cleaned.fasta",
-        info = "bold_info_filtered.tsv"
+        fasta="bold_clustered_cleaned.fasta",
+        info="bold_info_filtered.tsv",
     output:
-        assignTaxaFasta = "bold_clustered.assignTaxonomy.fasta",
-        addSpeciesFasta = "bold_clustered.addSpecies.fasta"
+        assignTaxaFasta="bold_clustered.assignTaxonomy.fasta",
+        addSpeciesFasta="bold_clustered.addSpecies.fasta",
     params:
-        ranks = config["database"]["ranks"]
+        ranks=config["database"]["ranks"],
     script:
-        "scripts/common.py"
\ No newline at end of file
+        "scripts/common.py"
+
+
+rule format_sintax:
+    input:
+        fasta="bold_clustered_cleaned.fasta",
+        info="bold_info_filtered.tsv",
+    output:
+        fasta="bold_clustered.sintax.fasta",
+    log:
+        "logs/bold/format_sintax.log",
+    params:
+        ranks=config["sintax"]["ranks"],
+        replace=config["sintax"]["replace_ranks"],
+    shell:
+        """
+        format_sintax.py {input.fasta} {input.info} {output.fasta} --ranks {params.ranks} --replace_rank {params.replace} 2>{log}
+        """
72 changes: 47 additions & 25 deletions src/coidb/__main__.py
@@ -15,41 +15,63 @@ class SnakemakeError(Exception):


 def run(args):
-    with resource_path('coidb', "Snakefile") as snakefile_path:
+    with resource_path("coidb", "Snakefile") as snakefile_path:
         forcerun = []
         if args.force:
             forcerun = args.targets
         success = snakemake(
-            snakefile_path, targets=args.targets, dryrun=args.dryrun,
-            cores=args.cores, configfiles=args.config_file,
-            cluster_config=args.cluster_config, workdir=args.workdir,
-            printshellcmds=args.printshellcmds, unlock=args.unlock,
+            snakefile_path,
+            targets=args.targets,
+            dryrun=args.dryrun,
+            cores=args.cores,
+            configfiles=args.config_file,
+            cluster_config=args.cluster_config,
+            workdir=args.workdir,
+            printshellcmds=args.printshellcmds,
+            unlock=args.unlock,
             forcerun=forcerun,
+            rerun_triggers="mtime",
         )
-        return success
+        if not success:
+            raise SnakemakeError()


 def main():
     parser = ArgumentParser()
-    parser.add_argument("targets", nargs='*', default=[],
-                        help="File(s) to create or steps to run. If omitted, "
-                             "the full pipeline is run.")
-    parser.add_argument("-n", "--dryrun", action="store_true",
-                        help="Only print what to do, don't do anything [False]")
-    parser.add_argument("-j", "--cores", type=int, default=4,
-                        help="Number of cores to run with [4]")
-    parser.add_argument("-f", "--force", action="store_true",
-                        help="Force workflow run")
-    parser.add_argument("-u", "--unlock", action="store_true",
-                        help="Unlock working directory")
-    parser.add_argument("-c", "--config-file", nargs='*', default=[],
-                        help="Path to configuration file")
-    parser.add_argument("--cluster-config", type=str,
-                        help="Path to cluster config (for running on SLURM)")
-    parser.add_argument("--workdir", type=str,
-                        help="Working directory. Defaults to current dir")
-    parser.add_argument("-p", "--printshellcmds", action="store_true",
-                        help="Print shell commands")
+    parser.add_argument(
+        "targets",
+        nargs="*",
+        default=[],
+        help="File(s) to create or steps to run. If omitted, "
+        "the full pipeline is run.",
+    )
+    parser.add_argument(
+        "-n",
+        "--dryrun",
+        action="store_true",
+        help="Only print what to do, don't do anything [False]",
+    )
+    parser.add_argument(
+        "-j", "--cores", type=int, default=4, help="Number of cores to run with [4]"
+    )
+    parser.add_argument("-f", "--force", action="store_true", help="Force workflow run")
+    parser.add_argument(
+        "-u", "--unlock", action="store_true", help="Unlock working directory"
+    )
+    parser.add_argument(
+        "-c", "--config-file", nargs="*", default=[], help="Path to configuration file"
+    )
+    parser.add_argument(
+        "--cluster-config",
+        type=str,
+        help="Path to cluster config (for running on SLURM)",
+    )
+    parser.add_argument(
+        "--workdir", type=str, help="Working directory. Defaults to current dir"
+    )
+    parser.add_argument(
+        "-p", "--printshellcmds", action="store_true", help="Print shell commands"
+    )
     args = parser.parse_args()
-    run(args)
\ No newline at end of file
+    run(args)
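
Two behavioural changes ride along with the black-style reformatting: rerun_triggers="mtime" restores the pre-7.8 Snakemake behaviour of re-running jobs only when input files are newer (not on every change to params or code), and a failed run now raises SnakemakeError instead of returning a status the caller could silently drop. A few typical invocations of the CLI, sketched under the assumption that coidb is installed as above (config.yml is a placeholder name):

    coidb -n                                   # dry-run the full pipeline
    coidb -j 8 -p bold_clustered.sintax.fasta  # build only the SINTAX database on 8 cores
    coidb -c config.yml --workdir results      # custom config and working directory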