Merge pull request #6 from NBISweden/updates

Updates
NBISweden · Apr 23, 2024 · d1e7b87 · d1e7b87
2 parents e60c5e5 + 104f538
commit d1e7b87
Show file tree

Hide file tree

Showing 13 changed files with 296 additions and 30 deletions.
diff --git a/.gitignore b/.gitignore
@@ -139,4 +139,7 @@ slurm/err
 slurm/logs
 tmp
 .Rproj.user
-*.Rproj
+*.Rproj
+.quarto
+*/libs
+*.html
diff --git a/README.md b/README.md
@@ -140,6 +140,32 @@ Here's an example of running up to and including the filtering part of the workf
 snakemake --use-conda -j 10 -p filter
 ```
 
+##### Configuring login to JGI server
+
+The workflow automatically downloads transcript data from the [JGI Mycocosm](https://mycocosm.jgi.doe.gov/fungi/fungi.info.html) database. For this to work you must have [registered an account](https://contacts.jgi.doe.gov/registration/new). After registering, create a file in YAML format containing your login credentials:
+
+```yaml
+jgi_password: <your_password>
+jgi_user: <your_username/email>
+```
+
+Save this file as `jgi_login.yml` and pass it to the workflow with the `--configfile` flag:
+
+```bash
+snakemake --configfile jgi_login.yml <other flags>
+```
+
+Information about genomes found in the JGI Mycocosm database will be stored in `resources/JGI/genomes.tsv`. This file will be available after a dry-run of the workflow so you may inspect it before running the workflow. The file should look like this:
+
+| portal | Name | bp | genes |
+|--------|------|----|------|
+| Aaoar1 | Aaosphaeria arxii CBS 175.79 v1.0 | 38901049 | 14203 |
+| Abobi1 | Abortiporus biennis CCBS 521 v1.0 | 45165060 | 11987 |
+| Abobie1 | Abortiporus biennis CIRM-BRFM 1778 v1.0 | 33118568 | 11767 |
+
+The `portal` column contains the JGI portal name, `Name` is the name of the genome, `bp` is the size of the genome in base pairs and `genes` is the number of genes in the genome. You may filter this file to your liking to only download transcripts for the genomes you are interested in. Remember to save it as `resources/JGI/genomes.tsv` before running the workflow.
+
+
 #### Assembly
 
 Assemblies can be generated for single samples or by combining multiple samples

diff --git a/Snakefile b/Snakefile
@@ -6,7 +6,7 @@ import platform
 from source.utils.parse import parse_sample_list
 from snakemake.utils import min_version, validate
 from snakemake.exceptions import WorkflowError
-
+from source.utils.list_jgi_genomes import parse_genomes
 
 def parse_validation_error(e):
     """
@@ -33,11 +33,6 @@ if not os.getenv("TMPDIR"):
     os.environ["TMPDIR"] = "tmp"
     os.makedirs(os.environ["TMPDIR"],exist_ok=True)
 
-wildcard_constraints:
-    sample_id = "[A-Za-z0-9_\-\.]+",
-    assembler = "megahit|trinity|transabyss",
-    filter_source = "unfiltered|filtered|taxmapper|bowtie2"
-
 # Set default config and validate against schema
 if os.path.exists("config.yml"):
     configfile: "config.yml"
@@ -49,14 +44,23 @@ except WorkflowError as e:
 
 workdir: config["workdir"]
 
+# Parse samples and assemblies
+samples, map_dict, assemblies = parse_sample_list(config["sample_file_list"], config)
+# Parse JGI genomes from fungi info url, also save to file for quick re-use in future runs
+genomes = parse_genomes(config["fungi_info"], f="resources/JGI/genomes.tsv")
+
+wildcard_constraints:
+    sample_id = f"({'|'.join(list(samples.keys()))})",
+    assembler = "megahit|trinity|transabyss",
+    filter_source = "unfiltered|filtered|taxmapper|bowtie2",
+    portals = f"({'|'.join(list(genomes.index.tolist()))})",
+
 # Get environment info
 pythonpath = sys.executable
 envdir = '/'.join(pythonpath.split("/")[0:-2])
 system = platform.system()
 config["system"] = system
 
-# Parse samples and assemblies
-samples, map_dict, assemblies = parse_sample_list(config["sample_file_list"], config)
 
 # Include rules
 include: "source/rules/db.smk"

diff --git a/config.yml b/config.yml
@@ -5,7 +5,7 @@ sample_file_list: "sample_list.tsv"
 datadir: "data/raw"
 mem_per_cpu: 6
 ## data for filtering
-fungi_url: "https://zenodo.org/record/3550762/files/fungi_transcripts.fasta.gz?download=1"
+fungi_info: "https://mycocosm.jgi.doe.gov/fungi/fungi.info.html"
 host_fna_url: "https://zenodo.org/record/3550762/files/spruce.fna.gz?download=1"
 host_gtf_url: ""
 ## mapping

diff --git a/config/config.schema.yaml b/config/config.schema.yaml
@@ -84,10 +84,10 @@ properties:
       'both_mapped' = requires that both ends are mapped or 'one_mapped' = keep paired reads even if only one end maps"
     default: "both_mapped"
     enum: ["concordant","both_mapped","one_mapped"]
-  fungi_transcript_url:
+  fungi_info:
     type: string
-    description: Dataset url for fungal transcripts to use in filtering
-    default: "https://zenodo.org/record/3550762/files/fungi_transcripts.fasta.gz?download=1"
+    description: URL to fungi info page
+    default: https://mycocosm.jgi.doe.gov/fungi/fungi.info.html
   spruce_transcript_url:
     type: string
     description: Dataset url for spruce transcripts to use in filtering

diff --git a/environment.yml b/environment.yml
@@ -1,13 +1,16 @@
 name: fungal-trans
 channels:
-  - bioconda
   - conda-forge
+  - bioconda
   - defaults
 dependencies:
-  - python=3.8
-  - snakemake-minimal=6.2.1
-  - biopython=1.78
-  - pandas=1.3.4
+  - python
+  - snakemake-minimal
+  - biopython
+  - pandas
+  - lxml
+  - pycurl
+  - tqdm
   - multiqc=1.11
   - bowtie2=2.4.4
   - samtools=1.14

diff --git a/envs/bowtie2.yaml b/envs/bowtie2.yaml
@@ -0,0 +1,8 @@
+name: bowtie2
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bowtie2=2.5.3
+  - samtools=1.20
diff --git a/local/config.yaml b/local/config.yaml
@@ -0,0 +1,4 @@
+use-conda: True
+keep-going: True
+rerun-triggers: mtime
+cores: 1
diff --git a/source/rules/annotate_co.smk b/source/rules/annotate_co.smk
@@ -371,7 +371,7 @@ rule create_blob_co:
     conda: "../../envs/blobtools.yaml"
     shell:
         """
-        blobtools create --nodes {input.nodes} --names {input.names} -i {input.fa} -b {input.bam} -t {input.hit} \ 
+        blobtools create --nodes {input.nodes} --names {input.names} -i {input.fa} -b {input.bam} -t {input.hit} \
             --title {wildcards.sample_id} -o {params.prefix}
         """
 

diff --git a/source/rules/db.smk b/source/rules/db.smk
@@ -6,7 +6,8 @@ localrules:
     get_kegg_files,
     download_refseq_db,
     download_host,
-    download_fungi_transcripts,
+    init_jgi,
+    download_jgi_transcripts,
     prepare_diamond_JGI
 
 ## NCBI Taxonomy ##
@@ -112,18 +113,62 @@ rule press_dbCAN:
         """
 
 ## Fungal transcripts ##
-rule download_fungi_transcripts:
+rule init_jgi:
     """
-    Downloads snapshot of fungal transcripts obtained from JGI 
+    Creates a cookie file for JGI authentication using user-defined credentials
     """
     output:
-        temp("resources/fungi/fungi_transcripts.fasta")
+        cookies="resources/JGI/cookies"
+    log:
+        "resources/JGI/init_jgi.log"
+    shadow: "minimal"
     params:
-        url = config["fungi_url"]
+        user_name = config["jgi_user"],
+        password = config["jgi_password"]
+    retries: 3
+    shell:
+        """
+        curl 'https://signon.jgi.doe.gov/signon/create' \
+            --data-urlencode 'login={params.user_name}' \
+            --data-urlencode 'password={params.password}' -c {output.cookies} > /dev/null 2>{log}
+        """
+
+rule download_jgi_transcripts:
+    """
+    For each 'portal', download the filtered transcripts 
+    """
+    output:
+        temp("resources/JGI/genomes/{portal}.transcripts.fna.gz"),
+    input:
+        cookies=rules.init_jgi.output.cookies,
+    log:
+        "resources/JGI/genomes/download_jgi_transcripts.{portal}.log"
+    params:
+        tmpdir = "$TMPDIR/{portal}",
+        xml = "$TMPDIR/{portal}/files.xml",
+        output = "$TMPDIR/{portal}/transcripts.fna.gz"
+    retries: 3
+    shell:
+        """
+        mkdir -p {params.tmpdir}
+        python source/utils/download_jgi_transcripts.py -p {wildcards.portal} -c {input.cookies} -o {params.output} 2>{log}
+        mv {params.output} {output}
+        rm -rf {params.tmpdir}
+        """
+
+rule concat_transcripts:
+    """
+    Concatenates all transcripts downloaded
+    """
+    output:
+        "resources/fungi/fungi_transcripts.fasta.gz"
+    input:
+        expand(rules.download_jgi_transcripts.output, portal=genomes.index.tolist())
+    log:
+        "resources/fungi/concat_transcripts.log"
     shell:
         """
-        curl -L -s -o {output[0]}.gz {params.url}
-        gunzip {output[0]}.gz
+        cat {input} > {output} 
         """
 
 ## Host data ##

diff --git a/source/rules/filter.smk b/source/rules/filter.smk
@@ -25,27 +25,29 @@ include: "paired_strategy.smk"
 
 rule bowtie_build_fungi:
     input:
-        "resources/fungi/fungi_transcripts.fasta"
+        rules.concat_transcripts.output
     output:
         expand("resources/fungi/fungi_transcripts.fasta.{index}.bt2l",
                index=range(1,5))
     log:
         "resources/fungi/bowtie2.log"
     params:
-        tmp = "$TMPDIR/fungi_transcripts/fungi_transcripts.fasta",
+        fasta = "$TMPDIR/fungi_transcripts/fungi_transcripts.fasta",
         tmpdir = "$TMPDIR/fungi_transcripts",
         outdir = "resources/fungi/"
     threads: 20
     resources:
         runtime = lambda wildcards, attempt: attempt**2*60*120
+    conda: "../../envs/bowtie2.yaml"
     shell:
         """
         mkdir -p {params.tmpdir}
+        gunzip -c {input} > {params.fasta}
         bowtie2-build \
             --threads {threads} \
             --large-index {input} \
-            {params.tmp} >{log} 2>&1
-        mv {params.tmp}*.bt2l {params.outdir}/
+            {params.fasta} >{log} 2>&1
+        mv {params.fasta}*.bt2l {params.outdir}/
         rm -r {params.tmpdir}
         """
 
@@ -83,6 +85,7 @@ rule bowtie_build_host:
         "resources/host/bowtie2.log"
     resources:
         runtime = lambda wildcards, attempt: attempt**2*60
+    conda: "../../envs/bowtie2.yaml"
     shell:
         """
         bowtie2-build \

diff --git a/source/utils/download_jgi_transcripts.py b/source/utils/download_jgi_transcripts.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+
+import xml.etree.ElementTree as ET
+import pycurl
+from datetime import datetime
+import pytz
+import tempfile
+import os
+from argparse import ArgumentParser
+import sys
+
+
+def tz_convert(timestamp):
+    """
+    Converts timestamp to UTC
+    """
+    timezones = {
+        "PDT": "America/Los_Angeles",
+        "PST": "America/Los_Angeles"
+                 }
+    tz = timestamp.split(" ")[-2]
+    timestamp = " ".join(timestamp.split(" ")[0:-2]+timestamp.split(" ")[-1:])
+    t = datetime.strptime(timestamp, "%a %b %d %H:%M:%S %Y")
+    tz = pytz.timezone(timezones[tz])
+    return tz.localize(t).astimezone(pytz.utc)
+
+def get_xml(portal, cookie):
+    """
+    Get XML tree from JGI portal
+    The xml file is stored in a temporary file and read into an ElementTree object
+    """
+    url=f"https://genome-downloads.jgi.doe.gov/portal/ext-api/downloads/get-directory?organism={portal}"
+    with tempfile.TemporaryFile() as fp:
+        c = pycurl.Curl()
+        c.setopt(c.COOKIEFILE, cookie)
+        c.setopt(c.URL, url)
+        c.setopt(c.WRITEDATA, fp)
+        c.setopt(c.VERBOSE, False)
+        c.perform()
+        c.close()
+        fp.seek(0)
+        tree = ET.fromstring(fp.read())
+    return tree
+
+def find_files(root):
+    """
+    Find files in the XML tree
+    Searches for the latest transcript file in the folder "Filtered Models ("best")/Transcripts"
+    """
+    f = ""
+    for child in root:
+        if child.tag == "folder" and child.attrib["name"] == "Files":
+            for item in child:
+                if item.tag == "folder" and item.attrib["name"] == "Annotation":
+                    for annot_item in item:
+                        if annot_item.tag == "folder" and annot_item.attrib["name"] == 'Filtered Models ("best")':
+                            for filtered_item in annot_item:
+                                if filtered_item.tag == "folder" and filtered_item.attrib["name"] == "Transcripts":
+                                    timestamp = False
+                                    for transcript_item in filtered_item:
+                                        if transcript_item.tag == "file":
+                                            filename=transcript_item.attrib["filename"]
+                                            if "transcripts" in filename and filename.endswith(".nt.fasta.gz"):
+                                                url = transcript_item.attrib["url"]
+                                                _timestamp = tz_convert(transcript_item.attrib["timestamp"])
+                                                if timestamp and _timestamp>timestamp:
+                                                    sys.stderr.write(f"Found newer file {filename} with timestamp {_timestamp}\n")
+                                                    f = url
+                                                elif not timestamp:
+                                                    sys.stderr.write(f"Found file {filename} with timestamp {_timestamp}\n")
+                                                    f = url
+                                                    timestamp = _timestamp
+                                    break
+    return f
+
+def main(args):
+    """
+    Main function
+    """
+    root = get_xml(args.portal, args.cookie)
+    f = find_files(root)
+    url = f"{args.base}{f}"
+    if not args.outfile:
+        outfile = os.path.basename(f)
+    else:
+        outfile = args.outfile
+    sys.stderr.write(f"Downloading {os.path.basename(f)} to {outfile}\n")
+    with open(outfile, 'wb') as fhout:
+        c = pycurl.Curl()
+        c.setopt(c.COOKIEFILE, args.cookie)
+        c.setopt(c.URL, url)
+        c.setopt(c.WRITEDATA, fhout)
+        c.setopt(c.VERBOSE, False)
+        c.perform()
+        c.close()
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser(description='Download all genomes from JGI Mycocosm')
+    parser.add_argument('-p', '--portal', help='Portal shorthand name (e.g. Aaoar1)', required=True)
+    parser.add_argument('-c', '--cookie', help="Cookie file for JGI", )
+    parser.add_argument('-o', '--outfile', help='Output file name. If not given, the file will be saved in the current directory with the remote filename')
+    parser.add_argument('-b', '--base', help='Base URL for JGI downloads (default: https://genome-downloads.jgi.doe.gov)', default="https://genome-downloads.jgi.doe.gov")
+    args = parser.parse_args()
+    main(args)