diff --git a/setup.py b/setup.py index dacc28a..24cd198 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="coidb", - version="0.3.2", + version="0.3.3", author="John Sundh", url="https://github.com/NBISweden/coidb/", description="Workflow for downloading and formatting COI database", diff --git a/src/coidb/Snakefile b/src/coidb/Snakefile index 28f78fb..9ed4b6a 100644 --- a/src/coidb/Snakefile +++ b/src/coidb/Snakefile @@ -7,7 +7,7 @@ singularity: "docker://continuumio/miniconda3:4.9.2" # Validate config validate(config, "config.schema.yaml") -localrules: coidb, download, filter, format +localrules: coidb, download, filter, clean, format rule coidb: input: expand("bold_clustered.{w}.fasta", w=["assignTaxonomy", "addSpecies"]) @@ -92,12 +92,24 @@ rule cluster: {input.fasta} {output.fasta} > {log} 2>&1 """ +rule clean: + """ + Cleans headers of sequences in clustered fasta file + """ + input: + fasta = "bold_clustered.fasta" + output: + fasta = "bold_clustered_cleaned.fasta" + script: + "scripts/common.py" + + rule format: """ Formats the clustered fasta file into DADA2-ready files """ input: - fasta = "bold_clustered.fasta", + fasta = "bold_clustered_cleaned.fasta", info = "bold_info_filtered.tsv" output: assignTaxaFasta = "bold_clustered.assignTaxonomy.fasta", diff --git a/src/coidb/scripts/common.py b/src/coidb/scripts/common.py index 244e500..80bb809 100644 --- a/src/coidb/scripts/common.py +++ b/src/coidb/scripts/common.py @@ -171,6 +171,22 @@ def filter(sm): info_df.to_csv(sm.output.info, header=True, index=True, sep="\t") +def clean_fasta(sm): + """ + Reformats fasta headers: strips a leading 'centroid=' and everything from ';seqs=<n>' onwards (vsearch specific strings) + + :param sm: snakemake object + :return: None, records are written to sm.output.fasta + """ + from Bio import SeqIO + import re + with open(sm.input.fasta, 'r') as fhin, open(sm.output.fasta, 'w') as fhout: + for record in SeqIO.parse(fhin, "fasta"): + desc = re.sub(r"^centroid=", "", record.description) + desc = re.split(r";seqs=\d+", desc)[0] + fhout.write(f">{desc}\n{record.seq}\n") + + 
def format_fasta(sm): """ Format a fasta file into two output files, one for use with the @@ -225,7 +241,8 @@ def format_fasta(sm): def main(sm): toolbox = {'filter': filter, - 'format': format_fasta} + 'format': format_fasta, + 'clean': clean_fasta} toolbox[sm.rule](sm)