Commit
Merge pull request #6 from NBISweden/updates
Updates
johnne authored Apr 23, 2024
2 parents e60c5e5 + 104f538 commit d1e7b87
Showing 13 changed files with 296 additions and 30 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -139,4 +139,7 @@ slurm/err
slurm/logs
tmp
.Rproj.user
*.Rproj
*.Rproj
.quarto
*/libs
*.html
26 changes: 26 additions & 0 deletions README.md
@@ -140,6 +140,32 @@ Here's an example of running up to and including the filtering part of the workflow
snakemake --use-conda -j 10 -p filter
```

##### Configuring login to JGI server

The workflow automatically downloads transcript data from the [JGI Mycocosm](https://mycocosm.jgi.doe.gov/fungi/fungi.info.html) database. For this to work, you must have [registered an account](https://contacts.jgi.doe.gov/registration/new) with JGI. After registering, create a YAML file with your login credentials:

```yaml
jgi_password: <your_password>
jgi_user: <your_username/email>
```
Save this file as `jgi_login.yml` and pass it to the workflow with the `--configfile` flag:

```bash
snakemake --configfile jgi_login.yml <other flags>
```

Information about the genomes found in the JGI Mycocosm database is stored in `resources/JGI/genomes.tsv`. This file is generated already by a dry-run, so you can inspect it before actually running the workflow. It should look like this:

| portal | Name | bp | genes |
|--------|------|----|------|
| Aaoar1 | Aaosphaeria arxii CBS 175.79 v1.0 | 38901049 | 14203 |
| Abobi1 | Abortiporus biennis CCBS 521 v1.0 | 45165060 | 11987 |
| Abobie1 | Abortiporus biennis CIRM-BRFM 1778 v1.0 | 33118568 | 11767 |

The `portal` column contains the JGI portal name, `Name` is the genome name, `bp` is the genome size in base pairs, and `genes` is the number of genes. You can filter this file to download transcripts only for the genomes you are interested in; just remember to save the result as `resources/JGI/genomes.tsv` before running the workflow.
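For example, a minimal filtering sketch in Python (pandas is already part of the workflow environment; the selected portals are hypothetical):

```python
import pandas as pd

# Read the table produced by the dry-run ('portal' assumed to be the first column)
genomes = pd.read_csv("resources/JGI/genomes.tsv", sep="\t", index_col=0)

# Keep only the portals of interest, then overwrite the file in place
keep = ["Aaoar1", "Abobi1"]
genomes.loc[keep].to_csv("resources/JGI/genomes.tsv", sep="\t")
```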


#### Assembly

Assemblies can be generated for single samples or by combining multiple samples
20 changes: 12 additions & 8 deletions Snakefile
@@ -6,7 +6,7 @@ import platform
from source.utils.parse import parse_sample_list
from snakemake.utils import min_version, validate
from snakemake.exceptions import WorkflowError

from source.utils.list_jgi_genomes import parse_genomes

def parse_validation_error(e):
"""
@@ -33,11 +33,6 @@ if not os.getenv("TMPDIR"):
os.environ["TMPDIR"] = "tmp"
os.makedirs(os.environ["TMPDIR"],exist_ok=True)

wildcard_constraints:
sample_id = "[A-Za-z0-9_\-\.]+",
assembler = "megahit|trinity|transabyss",
filter_source = "unfiltered|filtered|taxmapper|bowtie2"

# Set default config and validate against schema
if os.path.exists("config.yml"):
configfile: "config.yml"
@@ -49,14 +44,23 @@ except WorkflowError as e:

workdir: config["workdir"]

# Parse samples and assemblies
samples, map_dict, assemblies = parse_sample_list(config["sample_file_list"], config)
# Parse JGI genomes from fungi info url, also save to file for quick re-use in future runs
genomes = parse_genomes(config["fungi_info"], f="resources/JGI/genomes.tsv")

wildcard_constraints:
sample_id = f"({'|'.join(list(samples.keys()))})",
assembler = "megahit|trinity|transabyss",
filter_source = "unfiltered|filtered|taxmapper|bowtie2",
portals = f"({'|'.join(list(genomes.index.tolist()))})",

# Get environment info
pythonpath = sys.executable
envdir = '/'.join(pythonpath.split("/")[0:-2])
system = platform.system()
config["system"] = system

# Parse samples and assemblies
samples, map_dict, assemblies = parse_sample_list(config["sample_file_list"], config)

# Include rules
include: "source/rules/db.smk"
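The `parse_genomes` helper imported above lives in `source/utils/list_jgi_genomes.py`, whose diff is not shown here. A minimal sketch of what such a function might look like (hypothetical, not necessarily the author's implementation; it assumes the Mycocosm info page is an HTML table that pandas can parse via the newly added `lxml` dependency, and that the cached TSV is reused on later runs):

```python
import os
import pandas as pd

def parse_genomes(url, f):
    """Parse the Mycocosm fungi info page into a DataFrame indexed by portal."""
    if os.path.exists(f):
        # Re-use the cached table from a previous (dry-)run
        return pd.read_csv(f, sep="\t", index_col=0)
    # read_html returns every table on the page; assume the genome
    # listing is the first one and that it has a 'portal' column
    df = pd.read_html(url)[0].set_index("portal")
    os.makedirs(os.path.dirname(f), exist_ok=True)
    df.to_csv(f, sep="\t")
    return df
```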
2 changes: 1 addition & 1 deletion config.yml
@@ -5,7 +5,7 @@ sample_file_list: "sample_list.tsv"
datadir: "data/raw"
mem_per_cpu: 6
## data for filtering
fungi_url: "https://zenodo.org/record/3550762/files/fungi_transcripts.fasta.gz?download=1"
fungi_info: "https://mycocosm.jgi.doe.gov/fungi/fungi.info.html"
host_fna_url: "https://zenodo.org/record/3550762/files/spruce.fna.gz?download=1"
host_gtf_url: ""
## mapping
6 changes: 3 additions & 3 deletions config/config.schema.yaml
@@ -84,10 +84,10 @@ properties:
'both_mapped' = requires that both ends are mapped or 'one_mapped' = keep paired reads even if only one end maps"
default: "both_mapped"
enum: ["concordant","both_mapped","one_mapped"]
fungi_transcript_url:
fungi_info:
type: string
description: Dataset url for fungal transcripts to use in filtering
default: "https://zenodo.org/record/3550762/files/fungi_transcripts.fasta.gz?download=1"
description: URL to fungi info page
default: https://mycocosm.jgi.doe.gov/fungi/fungi.info.html
spruce_transcript_url:
type: string
description: Dataset url for spruce transcripts to use in filtering
13 changes: 8 additions & 5 deletions environment.yml
@@ -1,13 +1,16 @@
name: fungal-trans
channels:
- bioconda
- conda-forge
- bioconda
- defaults
dependencies:
- python=3.8
- snakemake-minimal=6.2.1
- biopython=1.78
- pandas=1.3.4
- python
- snakemake-minimal
- biopython
- pandas
- lxml
- pycurl
- tqdm
- multiqc=1.11
- bowtie2=2.4.4
- samtools=1.14
8 changes: 8 additions & 0 deletions envs/bowtie2.yaml
@@ -0,0 +1,8 @@
name: bowtie2
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bowtie2=2.5.3
- samtools=1.20
4 changes: 4 additions & 0 deletions local/config.yaml
@@ -0,0 +1,4 @@
use-conda: True
keep-going: True
rerun-triggers: mtime
cores: 1
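This new file matches the layout of a Snakemake profile, so presumably these options can be applied in one go by pointing the workflow at the directory, e.g. `snakemake --profile local`.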
2 changes: 1 addition & 1 deletion source/rules/annotate_co.smk
@@ -371,7 +371,7 @@ rule create_blob_co:
conda: "../../envs/blobtools.yaml"
shell:
"""
blobtools create --nodes {input.nodes} --names {input.names} -i {input.fa} -b {input.bam} -t {input.hit} \
blobtools create --nodes {input.nodes} --names {input.names} -i {input.fa} -b {input.bam} -t {input.hit} \
--title {wildcards.sample_id} -o {params.prefix}
"""

59 changes: 52 additions & 7 deletions source/rules/db.smk
@@ -6,7 +6,8 @@ localrules:
get_kegg_files,
download_refseq_db,
download_host,
download_fungi_transcripts,
init_jgi,
download_jgi_transcripts,
prepare_diamond_JGI

## NCBI Taxonomy ##
@@ -112,18 +113,62 @@ rule press_dbCAN:
"""

## Fungal transcripts ##
rule download_fungi_transcripts:
rule init_jgi:
"""
Downloads snapshot of fungal transcripts obtained from JGI
Creates a cookie file for JGI authentication using user-defined credentials
"""
output:
temp("resources/fungi/fungi_transcripts.fasta")
cookies="resources/JGI/cookies"
log:
"resources/JGI/init_jgi.log"
shadow: "minimal"
params:
url = config["fungi_url"]
user_name = config["jgi_user"],
password = config["jgi_password"]
retries: 3
shell:
"""
curl 'https://signon.jgi.doe.gov/signon/create' \
--data-urlencode 'login={params.user_name}' \
--data-urlencode 'password={params.password}' -c {output.cookies} > /dev/null 2>{log}
"""

rule download_jgi_transcripts:
"""
For each 'portal', download the filtered transcripts
"""
output:
temp("resources/JGI/genomes/{portal}.transcripts.fna.gz"),
input:
cookies=rules.init_jgi.output.cookies,
log:
"resources/JGI/genomes/download_jgi_transcripts.{portal}.log"
params:
tmpdir = "$TMPDIR/{portal}",
xml = "$TMPDIR/{portal}/files.xml",
output = "$TMPDIR/{portal}/transcripts.fna.gz"
retries: 3
shell:
"""
mkdir -p {params.tmpdir}
python source/utils/download_jgi_transcripts.py -p {wildcards.portal} -c {input.cookies} -o {params.output} 2>{log}
mv {params.output} {output}
rm -rf {params.tmpdir}
"""

rule concat_transcripts:
"""
    Concatenates all downloaded transcript files
"""
output:
"resources/fungi/fungi_transcripts.fasta.gz"
input:
expand(rules.download_jgi_transcripts.output, portal=genomes.index.tolist())
log:
"resources/fungi/concat_transcripts.log"
shell:
"""
curl -L -s -o {output[0]}.gz {params.url}
gunzip {output[0]}.gz
cat {input} > {output}
"""

## Host data ##
11 changes: 7 additions & 4 deletions source/rules/filter.smk
@@ -25,27 +25,29 @@ include: "paired_strategy.smk"

rule bowtie_build_fungi:
input:
"resources/fungi/fungi_transcripts.fasta"
rules.concat_transcripts.output
output:
expand("resources/fungi/fungi_transcripts.fasta.{index}.bt2l",
index=range(1,5))
log:
"resources/fungi/bowtie2.log"
params:
tmp = "$TMPDIR/fungi_transcripts/fungi_transcripts.fasta",
fasta = "$TMPDIR/fungi_transcripts/fungi_transcripts.fasta",
tmpdir = "$TMPDIR/fungi_transcripts",
outdir = "resources/fungi/"
threads: 20
resources:
runtime = lambda wildcards, attempt: attempt**2*60*120
conda: "../../envs/bowtie2.yaml"
shell:
"""
mkdir -p {params.tmpdir}
gunzip -c {input} > {params.fasta}
bowtie2-build \
--threads {threads} \
--large-index {input} \
{params.tmp} >{log} 2>&1
mv {params.tmp}*.bt2l {params.outdir}/
{params.fasta} >{log} 2>&1
mv {params.fasta}*.bt2l {params.outdir}/
rm -r {params.tmpdir}
"""

@@ -83,6 +85,7 @@ rule bowtie_build_host:
"resources/host/bowtie2.log"
resources:
runtime = lambda wildcards, attempt: attempt**2*60
conda: "../../envs/bowtie2.yaml"
shell:
"""
bowtie2-build \
105 changes: 105 additions & 0 deletions source/utils/download_jgi_transcripts.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python

import xml.etree.ElementTree as ET
import pycurl
from datetime import datetime
import pytz
import tempfile
import os
from argparse import ArgumentParser
import sys


def tz_convert(timestamp):
"""
Converts timestamp to UTC
"""
timezones = {
"PDT": "America/Los_Angeles",
"PST": "America/Los_Angeles"
}
    # Separate the timezone token, then re-join the rest for strptime
    tz = timestamp.split(" ")[-2]
    timestamp = " ".join(timestamp.split(" ")[0:-2] + timestamp.split(" ")[-1:])
t = datetime.strptime(timestamp, "%a %b %d %H:%M:%S %Y")
tz = pytz.timezone(timezones[tz])
return tz.localize(t).astimezone(pytz.utc)

def get_xml(portal, cookie):
"""
Get XML tree from JGI portal
The xml file is stored in a temporary file and read into an ElementTree object
"""
url=f"https://genome-downloads.jgi.doe.gov/portal/ext-api/downloads/get-directory?organism={portal}"
with tempfile.TemporaryFile() as fp:
c = pycurl.Curl()
c.setopt(c.COOKIEFILE, cookie)
c.setopt(c.URL, url)
c.setopt(c.WRITEDATA, fp)
c.setopt(c.VERBOSE, False)
c.perform()
c.close()
fp.seek(0)
tree = ET.fromstring(fp.read())
return tree

def find_files(root):
"""
    Find files in the XML tree
    Searches for the newest transcripts *.nt.fasta.gz file in the folder Files/Annotation/'Filtered Models ("best")'/Transcripts
"""
f = ""
for child in root:
if child.tag == "folder" and child.attrib["name"] == "Files":
for item in child:
if item.tag == "folder" and item.attrib["name"] == "Annotation":
for annot_item in item:
if annot_item.tag == "folder" and annot_item.attrib["name"] == 'Filtered Models ("best")':
for filtered_item in annot_item:
if filtered_item.tag == "folder" and filtered_item.attrib["name"] == "Transcripts":
timestamp = False
for transcript_item in filtered_item:
if transcript_item.tag == "file":
filename=transcript_item.attrib["filename"]
if "transcripts" in filename and filename.endswith(".nt.fasta.gz"):
url = transcript_item.attrib["url"]
_timestamp = tz_convert(transcript_item.attrib["timestamp"])
if timestamp and _timestamp>timestamp:
sys.stderr.write(f"Found newer file {filename} with timestamp {_timestamp}\n")
f = url
elif not timestamp:
sys.stderr.write(f"Found file {filename} with timestamp {_timestamp}\n")
f = url
timestamp = _timestamp
break
return f

def main(args):
"""
Main function
"""
root = get_xml(args.portal, args.cookie)
    f = find_files(root)
    # Guard against an empty result so we don't try to download the bare base URL
    if not f:
        sys.exit(f"No transcript file found for portal {args.portal}")
    url = f"{args.base}{f}"
if not args.outfile:
outfile = os.path.basename(f)
else:
outfile = args.outfile
sys.stderr.write(f"Downloading {os.path.basename(f)} to {outfile}\n")
with open(outfile, 'wb') as fhout:
c = pycurl.Curl()
c.setopt(c.COOKIEFILE, args.cookie)
c.setopt(c.URL, url)
c.setopt(c.WRITEDATA, fhout)
c.setopt(c.VERBOSE, False)
c.perform()
c.close()


if __name__ == '__main__':
parser = ArgumentParser(description='Download all genomes from JGI Mycocosm')
parser.add_argument('-p', '--portal', help='Portal shorthand name (e.g. Aaoar1)', required=True)
    parser.add_argument('-c', '--cookie', help="Cookie file for JGI", required=True)
parser.add_argument('-o', '--outfile', help='Output file name. If not given, the file will be saved in the current directory with the remote filename')
parser.add_argument('-b', '--base', help='Base URL for JGI downloads (default: https://genome-downloads.jgi.doe.gov)', default="https://genome-downloads.jgi.doe.gov")
args = parser.parse_args()
main(args)
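For debugging a single portal outside of Snakemake, the script's helpers can also be exercised directly; a sketch, assuming it is run from the repository root with a cookie file already created by the `init_jgi` rule:

```python
from source.utils.download_jgi_transcripts import get_xml, find_files

# Fetch the XML file listing for one portal and locate the newest
# filtered-transcripts entry; an empty string means nothing was found
root = get_xml("Aaoar1", "resources/JGI/cookies")
print(find_files(root))
```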