diff --git a/.gitignore b/.gitignore index b2592a6..5df9590 100644 --- a/.gitignore +++ b/.gitignore @@ -139,4 +139,7 @@ slurm/err slurm/logs tmp .Rproj.user -*.Rproj \ No newline at end of file +*.Rproj +.quarto +*/libs +*.html \ No newline at end of file diff --git a/README.md b/README.md index 3f52c95..547aeb5 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,32 @@ Here's an example of running up to and including the filtering part of the workf snakemake --use-conda -j 10 -p filter ``` +##### Configuring login to JGI server + +The workflow automatically downloads transcript data from the [JGI Mycocosm](https://mycocosm.jgi.doe.gov/fungi/fungi.info.html) database. For this to work you must have [registered an account](https://contacts.jgi.doe.gov/registration/new). After registering, create a file in YAML format containing your login credentials: + +```yaml +jgi_password: +jgi_user: +``` + +Save this file as `jgi_login.yml` and pass it to the workflow with the `--configfile` flag: + +```bash +snakemake --configfile jgi_login.yml +``` + +Information about genomes found in the JGI Mycocosm database will be stored in `resources/JGI/genomes.tsv`. This file will be available after a dry-run of the workflow so you may inspect it before running the workflow. The file should look like this: + +| portal | Name | bp | genes | +|--------|------|----|------| +| Aaoar1 | Aaosphaeria arxii CBS 175.79 v1.0 | 38901049 | 14203 | +| Abobi1 | Abortiporus biennis CCBS 521 v1.0 | 45165060 | 11987 | +| Abobie1 | Abortiporus biennis ​CIRM-BRFM 1778 v1.0 | 33118568 | 11767 | + +The `portal` column contains the JGI portal name, `Name` is the name of the genome, `bp` is the size of the genome in base pairs and `genes` is the number of genes in the genome. You may filter this file to your liking to only download transcripts for the genomes you are interested in. Remember to save it as `resources/JGI/genomes.tsv` before running the workflow. + + #### Assembly Assemblies can be generated for single samples or by combining multiple samples diff --git a/Snakefile b/Snakefile index 7dc705e..d6ef5c0 100644 --- a/Snakefile +++ b/Snakefile @@ -6,7 +6,7 @@ import platform from source.utils.parse import parse_sample_list from snakemake.utils import min_version, validate from snakemake.exceptions import WorkflowError - +from source.utils.list_jgi_genomes import parse_genomes def parse_validation_error(e): """ @@ -33,11 +33,6 @@ if not os.getenv("TMPDIR"): os.environ["TMPDIR"] = "tmp" os.makedirs(os.environ["TMPDIR"],exist_ok=True) -wildcard_constraints: - sample_id = "[A-Za-z0-9_\-\.]+", - assembler = "megahit|trinity|transabyss", - filter_source = "unfiltered|filtered|taxmapper|bowtie2" - # Set default config and validate against schema if os.path.exists("config.yml"): configfile: "config.yml" @@ -49,14 +44,23 @@ except WorkflowError as e: workdir: config["workdir"] +# Parse samples and assemblies +samples, map_dict, assemblies = parse_sample_list(config["sample_file_list"], config) +# Parse JGI genomes from fungi info url, also save to file for quick re-use in future runs +genomes = parse_genomes(config["fungi_info"], f="resources/JGI/genomes.tsv") + +wildcard_constraints: + sample_id = f"({'|'.join(list(samples.keys()))})", + assembler = "megahit|trinity|transabyss", + filter_source = "unfiltered|filtered|taxmapper|bowtie2", + portals = f"({'|'.join(list(genomes.index.tolist()))})", + # Get environment info pythonpath = sys.executable envdir = '/'.join(pythonpath.split("/")[0:-2]) system = platform.system() config["system"] = system -# Parse samples and assemblies -samples, map_dict, assemblies = parse_sample_list(config["sample_file_list"], config) # Include rules include: "source/rules/db.smk" diff --git a/config.yml b/config.yml index fa4ab7f..ed88e03 100644 --- a/config.yml +++ b/config.yml @@ -5,7 +5,7 @@ sample_file_list: "sample_list.tsv" datadir: "data/raw" mem_per_cpu: 6 ## data for filtering -fungi_url: "https://zenodo.org/record/3550762/files/fungi_transcripts.fasta.gz?download=1" +fungi_info: "https://mycocosm.jgi.doe.gov/fungi/fungi.info.html" host_fna_url: "https://zenodo.org/record/3550762/files/spruce.fna.gz?download=1" host_gtf_url: "" ## mapping diff --git a/config/config.schema.yaml b/config/config.schema.yaml index a131924..02ad98c 100644 --- a/config/config.schema.yaml +++ b/config/config.schema.yaml @@ -84,10 +84,10 @@ properties: 'both_mapped' = requires that both ends are mapped or 'one_mapped' = keep paired reads even if only one end maps" default: "both_mapped" enum: ["concordant","both_mapped","one_mapped"] - fungi_transcript_url: + fungi_info: type: string - description: Dataset url for fungal transcripts to use in filtering - default: "https://zenodo.org/record/3550762/files/fungi_transcripts.fasta.gz?download=1" + description: URL to fungi info page + default: https://mycocosm.jgi.doe.gov/fungi/fungi.info.html spruce_transcript_url: type: string description: Dataset url for spruce transcripts to use in filtering diff --git a/environment.yml b/environment.yml index 01aebe4..9d5b144 100644 --- a/environment.yml +++ b/environment.yml @@ -1,13 +1,16 @@ name: fungal-trans channels: - - bioconda - conda-forge + - bioconda - defaults dependencies: - - python=3.8 - - snakemake-minimal=6.2.1 - - biopython=1.78 - - pandas=1.3.4 + - python + - snakemake-minimal + - biopython + - pandas + - lxml + - pycurl + - tqdm - multiqc=1.11 - bowtie2=2.4.4 - samtools=1.14 diff --git a/envs/bowtie2.yaml b/envs/bowtie2.yaml new file mode 100644 index 0000000..300998b --- /dev/null +++ b/envs/bowtie2.yaml @@ -0,0 +1,8 @@ +name: bowtie2 +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bowtie2=2.5.3 + - samtools=1.20 \ No newline at end of file diff --git a/local/config.yaml b/local/config.yaml new file mode 100644 index 0000000..a76717f --- /dev/null +++ b/local/config.yaml @@ -0,0 +1,4 @@ +use-conda: True +keep-going: True +rerun-triggers: mtime +cores: 1 \ No newline at end of file diff --git a/source/rules/annotate_co.smk b/source/rules/annotate_co.smk index e6fd9ae..3d17aa8 100644 --- a/source/rules/annotate_co.smk +++ b/source/rules/annotate_co.smk @@ -371,7 +371,7 @@ rule create_blob_co: conda: "../../envs/blobtools.yaml" shell: """ - blobtools create --nodes {input.nodes} --names {input.names} -i {input.fa} -b {input.bam} -t {input.hit} \ + blobtools create --nodes {input.nodes} --names {input.names} -i {input.fa} -b {input.bam} -t {input.hit} \ --title {wildcards.sample_id} -o {params.prefix} """ diff --git a/source/rules/db.smk b/source/rules/db.smk index 8c989f3..0d2e179 100644 --- a/source/rules/db.smk +++ b/source/rules/db.smk @@ -6,7 +6,8 @@ localrules: get_kegg_files, download_refseq_db, download_host, - download_fungi_transcripts, + init_jgi, + download_jgi_transcripts, prepare_diamond_JGI ## NCBI Taxonomy ## @@ -112,18 +113,62 @@ rule press_dbCAN: """ ## Fungal transcripts ## -rule download_fungi_transcripts: +rule init_jgi: """ - Downloads snapshot of fungal transcripts obtained from JGI + Creates a cookie file for JGI authentication using user-defined credentials """ output: - temp("resources/fungi/fungi_transcripts.fasta") + cookies="resources/JGI/cookies" + log: + "resources/JGI/init_jgi.log" + shadow: "minimal" params: - url = config["fungi_url"] + user_name = config["jgi_user"], + password = config["jgi_password"] + retries: 3 + shell: + """ + curl 'https://signon.jgi.doe.gov/signon/create' \ + --data-urlencode 'login={params.user_name}' \ + --data-urlencode 'password={params.password}' -c {output.cookies} > /dev/null 2>{log} + """ + +rule download_jgi_transcripts: + """ + For each 'portal', download the filtered transcripts + """ + output: + temp("resources/JGI/genomes/{portal}.transcripts.fna.gz"), + input: + cookies=rules.init_jgi.output.cookies, + log: + "resources/JGI/genomes/download_jgi_transcripts.{portal}.log" + params: + tmpdir = "$TMPDIR/{portal}", + xml = "$TMPDIR/{portal}/files.xml", + output = "$TMPDIR/{portal}/transcripts.fna.gz" + retries: 3 + shell: + """ + mkdir -p {params.tmpdir} + python source/utils/download_jgi_transcripts.py -p {wildcards.portal} -c {input.cookies} -o {params.output} 2>{log} + mv {params.output} {output} + rm -rf {params.tmpdir} + """ + +rule concat_transcripts: + """ + Concatenates all transcripts downloaded + """ + output: + "resources/fungi/fungi_transcripts.fasta.gz" + input: + expand(rules.download_jgi_transcripts.output, portal=genomes.index.tolist()) + log: + "resources/fungi/concat_transcripts.log" shell: """ - curl -L -s -o {output[0]}.gz {params.url} - gunzip {output[0]}.gz + cat {input} > {output} """ ## Host data ## diff --git a/source/rules/filter.smk b/source/rules/filter.smk index 7608483..fc423d1 100644 --- a/source/rules/filter.smk +++ b/source/rules/filter.smk @@ -25,27 +25,29 @@ include: "paired_strategy.smk" rule bowtie_build_fungi: input: - "resources/fungi/fungi_transcripts.fasta" + rules.concat_transcripts.output output: expand("resources/fungi/fungi_transcripts.fasta.{index}.bt2l", index=range(1,5)) log: "resources/fungi/bowtie2.log" params: - tmp = "$TMPDIR/fungi_transcripts/fungi_transcripts.fasta", + fasta = "$TMPDIR/fungi_transcripts/fungi_transcripts.fasta", tmpdir = "$TMPDIR/fungi_transcripts", outdir = "resources/fungi/" threads: 20 resources: runtime = lambda wildcards, attempt: attempt**2*60*120 + conda: "../../envs/bowtie2.yaml" shell: """ mkdir -p {params.tmpdir} + gunzip -c {input} > {params.fasta} bowtie2-build \ --threads {threads} \ --large-index {input} \ - {params.tmp} >{log} 2>&1 - mv {params.tmp}*.bt2l {params.outdir}/ + {params.fasta} >{log} 2>&1 + mv {params.fasta}*.bt2l {params.outdir}/ rm -r {params.tmpdir} """ @@ -83,6 +85,7 @@ rule bowtie_build_host: "resources/host/bowtie2.log" resources: runtime = lambda wildcards, attempt: attempt**2*60 + conda: "../../envs/bowtie2.yaml" shell: """ bowtie2-build \ diff --git a/source/utils/download_jgi_transcripts.py b/source/utils/download_jgi_transcripts.py new file mode 100644 index 0000000..38f522a --- /dev/null +++ b/source/utils/download_jgi_transcripts.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python + +import xml.etree.ElementTree as ET +import pycurl +from datetime import datetime +import pytz +import tempfile +import os +from argparse import ArgumentParser +import sys + + +def tz_convert(timestamp): + """ + Converts timestamp to UTC + """ + timezones = { + "PDT": "America/Los_Angeles", + "PST": "America/Los_Angeles" + } + tz = timestamp.split(" ")[-2] + timestamp = " ".join(timestamp.split(" ")[0:-2]+timestamp.split(" ")[-1:]) + t = datetime.strptime(timestamp, "%a %b %d %H:%M:%S %Y") + tz = pytz.timezone(timezones[tz]) + return tz.localize(t).astimezone(pytz.utc) + +def get_xml(portal, cookie): + """ + Get XML tree from JGI portal + The xml file is stored in a temporary file and read into an ElementTree object + """ + url=f"https://genome-downloads.jgi.doe.gov/portal/ext-api/downloads/get-directory?organism={portal}" + with tempfile.TemporaryFile() as fp: + c = pycurl.Curl() + c.setopt(c.COOKIEFILE, cookie) + c.setopt(c.URL, url) + c.setopt(c.WRITEDATA, fp) + c.setopt(c.VERBOSE, False) + c.perform() + c.close() + fp.seek(0) + tree = ET.fromstring(fp.read()) + return tree + +def find_files(root): + """ + Find files in the XML tree + Searches for the latest transcript file in the folder "Filtered Models ("best")/Transcripts" + """ + f = "" + for child in root: + if child.tag == "folder" and child.attrib["name"] == "Files": + for item in child: + if item.tag == "folder" and item.attrib["name"] == "Annotation": + for annot_item in item: + if annot_item.tag == "folder" and annot_item.attrib["name"] == 'Filtered Models ("best")': + for filtered_item in annot_item: + if filtered_item.tag == "folder" and filtered_item.attrib["name"] == "Transcripts": + timestamp = False + for transcript_item in filtered_item: + if transcript_item.tag == "file": + filename=transcript_item.attrib["filename"] + if "transcripts" in filename and filename.endswith(".nt.fasta.gz"): + url = transcript_item.attrib["url"] + _timestamp = tz_convert(transcript_item.attrib["timestamp"]) + if timestamp and _timestamp>timestamp: + sys.stderr.write(f"Found newer file {filename} with timestamp {_timestamp}\n") + f = url + elif not timestamp: + sys.stderr.write(f"Found file {filename} with timestamp {_timestamp}\n") + f = url + timestamp = _timestamp + break + return f + +def main(args): + """ + Main function + """ + root = get_xml(args.portal, args.cookie) + f = find_files(root) + url = f"{args.base}{f}" + if not args.outfile: + outfile = os.path.basename(f) + else: + outfile = args.outfile + sys.stderr.write(f"Downloading {os.path.basename(f)} to {outfile}\n") + with open(outfile, 'wb') as fhout: + c = pycurl.Curl() + c.setopt(c.COOKIEFILE, args.cookie) + c.setopt(c.URL, url) + c.setopt(c.WRITEDATA, fhout) + c.setopt(c.VERBOSE, False) + c.perform() + c.close() + + +if __name__ == '__main__': + parser = ArgumentParser(description='Download all genomes from JGI Mycocosm') + parser.add_argument('-p', '--portal', help='Portal shorthand name (e.g. Aaoar1)', required=True) + parser.add_argument('-c', '--cookie', help="Cookie file for JGI", ) + parser.add_argument('-o', '--outfile', help='Output file name. If not given, the file will be saved in the current directory with the remote filename') + parser.add_argument('-b', '--base', help='Base URL for JGI downloads (default: https://genome-downloads.jgi.doe.gov)', default="https://genome-downloads.jgi.doe.gov") + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/source/utils/list_jgi_genomes.py b/source/utils/list_jgi_genomes.py new file mode 100644 index 0000000..e2cd094 --- /dev/null +++ b/source/utils/list_jgi_genomes.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +import pandas as pd +import numpy as np +import xml.etree.ElementTree as ET +import os +import sys +from argparse import ArgumentParser + +def parse_genomes(url, f = None): + """ + Parses available genome portals at JGI (default: https://mycocosm.jgi.doe.gov/fungi/fungi.info.html) + + :param url: str + :return: dict + """ + def extract_name_portal(genome_table): + df = {} + for _, items in genome_table.iterrows(): + name, portal = items["Name"] + if name == "Name": + continue + asm_length, _ = items["Assembly Length"] + genes, _ = items["# Genes"] + if asm_length == "NO DATA": + asm_length = np.nan + else: + asm_length = int(asm_length.replace(",","")) + if genes == "NO DATA": + genes = np.nan + else: + genes = int(genes.replace(",","")) + df[portal.lstrip("/")] = {'Name': name, 'bp': asm_length, 'genes': genes} + df = pd.DataFrame(df).T + df.index.name = "portal" + return df + if f is not None and os.path.exists(f): + genomes = pd.read_csv(f, sep="\t", header=0, index_col=0) + return genomes + genome_table = pd.read_html(url, extract_links="body", header=None)[0] + genome_table.columns = ["##","Name","Assembly Length","# Genes", "Published"] + genome_table.set_index("##", inplace=True) + genomes = extract_name_portal(genome_table) + if f is not None: + os.makedirs(os.path.dirname(f), exist_ok=True) + genomes.to_csv(f, sep="\t") + else: + with sys.stdout as f: + genomes.to_csv(f, sep="\t") + return genomes + + +def main(args): + """ + Main function + """ + genomes = parse_genomes(args.info, args.file) + + +if __name__ == '__main__': + parser = ArgumentParser(description='Download a table of all available genomes from JGI Mycocosm') + parser.add_argument('-i', '--info', help='Info file at JGI Mycocosm (default: https://mycocosm.jgi.doe.gov/fungi/fungi.info.html)', default="https://mycocosm.jgi.doe.gov/fungi/fungi.info.html") + parser.add_argument('-f', '--file', help='Output file name. Will print to stdout if not provided.') + args = parser.parse_args() + main(args) \ No newline at end of file