Merge pull request #9 from johnne/devel

Devel
biodiversitydata-se · Dec 18, 2021 · ca28f41 · ca28f41
2 parents 2c394bc + 0ff3a86
commit ca28f41
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 4 deletions.
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="coidb",
-    version="0.3.2",
+    version="0.3.3",
     author="John Sundh",
     url="https://github.com/NBISweden/coidb/",
     description="Workflow for downloading and formatting COI database",

diff --git a/src/coidb/Snakefile b/src/coidb/Snakefile
@@ -7,7 +7,7 @@ singularity: "docker://continuumio/miniconda3:4.9.2"
 # Validate config
 validate(config, "config.schema.yaml")
 
-localrules: coidb, download, filter, format
+localrules: coidb, download, filter, clean, format
 
 rule coidb:
     input: expand("bold_clustered.{w}.fasta", w=["assignTaxonomy", "addSpecies"])
@@ -92,12 +92,24 @@ rule cluster:
             {input.fasta} {output.fasta} > {log} 2>&1 
         """
 
+rule clean:
+    """
+    Strips vsearch-specific strings from sequence headers in the clustered fasta file
+    """
+    input:
+        fasta = "bold_clustered.fasta"
+    output:
+        fasta = "bold_clustered_cleaned.fasta"
+    script:
+        "scripts/common.py"
+
+
 rule format:
     """
     Formats the clustered fasta file into DADA2-ready files
     """
     input:
-        fasta = "bold_clustered.fasta",
+        fasta = "bold_clustered_cleaned.fasta",
         info = "bold_info_filtered.tsv"
     output:
         assignTaxaFasta = "bold_clustered.assignTaxonomy.fasta",

diff --git a/src/coidb/scripts/common.py b/src/coidb/scripts/common.py
@@ -171,6 +171,22 @@ def filter(sm):
     info_df.to_csv(sm.output.info, header=True, index=True, sep="\t")
 
 
+def clean_fasta(sm):
+    """
+    Reformats fasta headers to strip vsearch specific strings
+
+    :param sm: snakemake object; reads sm.input.fasta, writes sm.output.fasta
+    :return: None
+    """
+    from Bio import SeqIO
+    import re
+    with open(sm.input.fasta, 'r') as fhin, open(sm.output.fasta, 'w') as fhout:
+        for record in SeqIO.parse(fhin, "fasta"):
+            # Anchored regex: str.lstrip() strips a character set, not the literal prefix
+            desc = re.split(r";seqs=\d+", re.sub(r"^centroid=", "", record.description))[0]
+            fhout.write(f">{desc}\n{record.seq}\n")
+
+
 def format_fasta(sm):
     """
     Format a fasta file into two output files, one for use with the
@@ -225,7 +241,8 @@ def format_fasta(sm):
 
 def main(sm):
     toolbox = {'filter': filter,
-               'format': format_fasta}
+               'format': format_fasta,
+               'clean': clean_fasta}
     toolbox[sm.rule](sm)