Skip to content

Commit

Permalink
Merge pull request #9 from johnne/devel
Browse files Browse the repository at this point in the history
Devel
  • Loading branch information
johnne authored Dec 18, 2021
2 parents 2c394bc + 0ff3a86 commit ca28f41
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 4 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="coidb",
version="0.3.2",
version="0.3.3",
author="John Sundh",
url="https://github.com/NBISweden/coidb/",
description="Workflow for downloading and formatting COI database",
Expand Down
16 changes: 14 additions & 2 deletions src/coidb/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ singularity: "docker://continuumio/miniconda3:4.9.2"
# Validate config
validate(config, "config.schema.yaml")

localrules: coidb, download, filter, format
localrules: coidb, download, filter, clean, format

rule coidb:
input: expand("bold_clustered.{w}.fasta", w=["assignTaxonomy", "addSpecies"])
Expand Down Expand Up @@ -92,12 +92,24 @@ rule cluster:
{input.fasta} {output.fasta} > {log} 2>&1
"""

rule clean:
"""
Cleans headers of sequences in clustered fasta file

Reads bold_clustered.fasta and writes bold_clustered_cleaned.fasta by
running scripts/common.py, which strips the vsearch specific strings
from the fasta headers.
"""
input:
fasta = "bold_clustered.fasta"
output:
fasta = "bold_clustered_cleaned.fasta"
script:
"scripts/common.py"


rule format:
"""
Formats the clustered fasta file into DADA2-ready files
"""
input:
fasta = "bold_clustered.fasta",
fasta = "bold_clustered_cleaned.fasta",
info = "bold_info_filtered.tsv"
output:
assignTaxaFasta = "bold_clustered.assignTaxonomy.fasta",
Expand Down
19 changes: 18 additions & 1 deletion src/coidb/scripts/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,22 @@ def filter(sm):
info_df.to_csv(sm.output.info, header=True, index=True, sep="\t")


def clean_fasta(sm):
    """
    Reformats fasta headers to strip vsearch specific strings

    For every record, a leading ``centroid=`` label is removed from the
    header and the header is cut at the first ``;seqs=<digits>`` annotation.
    The sequence is written unchanged on a single line.

    :param sm: snakemake object; ``sm.input.fasta`` is read and
        ``sm.output.fasta`` is written
    :return: None
    """
    from Bio import SeqIO
    import re
    prefix = "centroid="
    # Compile once, outside the loop; raw string so "\d" is a regex escape.
    size_annotation = re.compile(r";seqs=\d+")
    with open(sm.input.fasta, 'r') as fhin, open(sm.output.fasta, 'w') as fhout:
        for record in SeqIO.parse(fhin, "fasta"):
            desc = record.description
            # str.lstrip("centroid=") removes any leading run of the
            # characters c/e/n/t/r/o/i/d/=, so it would also eat the start
            # of ids such as "dino1". Remove the exact prefix instead.
            if desc.startswith(prefix):
                desc = desc[len(prefix):]
            desc = size_annotation.split(desc, maxsplit=1)[0]
            fhout.write(f">{desc}\n{record.seq}\n")


def format_fasta(sm):
"""
Format a fasta file into two output files, one for use with the
Expand Down Expand Up @@ -225,7 +241,8 @@ def format_fasta(sm):

def main(sm):
# Dispatch on the name of the calling rule: look up the function registered
# under sm.rule in the toolbox mapping and call it with the snakemake object.
# NOTE(review): this hunk shows the mapping's closing line both before and
# after the change that registers 'clean'.
toolbox = {'filter': filter,
'format': format_fasta}
'format': format_fasta,
'clean': clean_fasta}
toolbox[sm.rule](sm)


Expand Down

0 comments on commit ca28f41

Please sign in to comment.