Skip to content

Commit

Permalink
Merge pull request #139 from jaleezyy/nextclade3
Browse files Browse the repository at this point in the history
Nextclade v3+ Support
  • Loading branch information
jaleezyy authored Jan 26, 2024
2 parents 2c47201 + b8f8954 commit 2106195
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 87 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ positional arguments:
postprocess Run SIGNAL postprocessing on completed SIGNAL run. '--configfile' is required but will be generated if '--directory' is provided
ncov_tools Generate configuration file and filesystem setup required and then execute ncov-tools quality control assessment. Requires 'ncov-tools' submodule! '--configfile' is required
but will be generated if '--directory' is provided
install Install individual rule environments and ensure SIGNAL is functional. The only parameter operable will be '--data'. Will override other operations!
install Install individual rule environments and ensure SIGNAL is functional. The only parameters operable will be '--data' and '--unlock'. Will override other operations!
optional arguments:
-h, --help show this help message and exit
Expand Down
1 change: 1 addition & 0 deletions conda_envs/postprocessing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ dependencies:
- matplotlib
- numpy
- pandas
- pyarrow
10 changes: 6 additions & 4 deletions example_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,15 @@ pango-designation:
pangolin-data:

# Versions for Nextclade (software & datasets)
# Software version (nextclade) should use numbers only (i.e., 1.11.0)
# Be as specific as possible with the desired dataset tag (nextclade-data). Can accept dates (YYYY-mm-dd) alone, but will assume corresponding timestamp (HH:MM:SS)
# Typical tag format is YYYY-mm-ddTHH:MM:SSZ
# nextclade: Software version. Input should use numbers only (e.g., 2.14.0)
# nextclade-data: The nextclade dataset tag. Refer to available nextclade datasets. Accepted tag format is 'YYYY-mm-ddTHH:MM:SSZ'
# Be as specific as possible with the desired dataset tag. A date alone (YYYY-mm-dd) is also accepted, in which case a timestamp of 12:00:00 is assumed. SIGNAL will automatically convert between the v2 and v3 dataset tag formats
# Leave blank for latest versions
# Setting nextclade-include-recomb to False will download the recombinant-sequence free version of the Nextclade database
nextclade:
nextclade-data:

# Nextclade v2 only
# nextclade-include-recomb: setting this to False will download the recombinant-sequence-free version of the nextclade dataset
nextclade-include-recomb: True

# ANYTHING BELOW IS ONLY NEEDED IF USING NCOV-TOOLS SUMMARIES
Expand Down
203 changes: 132 additions & 71 deletions scripts/assign_lineages.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ def check_file(path: str) -> Path:
else:
raise argparse.ArgumentTypeError(f"{path} can't be read")

def check_frontend():
    """
    Pick the package manager used to search for and install Nextclade.

    Runs ``mamba list`` with all output discarded.

    Returns:
        'mamba' if the command ran and exited with status 0, otherwise 'conda'.
    """
    try:
        subprocess.check_call(['mamba', 'list'], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: mamba was found but exited non-zero.
        # OSError (e.g. FileNotFoundError/PermissionError): the mamba
        # executable is missing or cannot be run -- this is the common
        # "mamba not installed" case, so fall back to conda as well.
        return 'conda'
    return 'mamba'

def update_latest_pangolin():
"""
Ensure pangolin is updated to the latest release
Expand All @@ -40,80 +47,140 @@ def update_pangolin(vers):
script = os.path.join(script_dir, "pangolin_specific_version_update.py")
subprocess.run([script, '--versions_file', vers])

def update_nextclade():
def nextclade_tag(dir, version):
    """
    Pull the tag value of the Nextclade dataset stored in a directory.

    The major version of Nextclade decides which tag file is trusted:
    'tag.json' for Nextclade < 3 (dataset V2) or 'pathogen.json' for
    Nextclade >= 3 (dataset V3+). Residual files belonging to the other
    dataset version are removed from the directory to minimize conflicts.

    Args:
        dir: Path to the Nextclade dataset directory.
        version: Nextclade version string (e.g., '2.14.0'); only the part
            before the first '.' is used.

    Returns:
        The dataset tag, or None when no tag file matching the Nextclade
        major version is present (dataset incomplete or non-existent).

    Raises:
        ValueError: if a tag file exists but the major version is not an integer.
    """
    last_known_ver = version.split(".")[0]
    v2 = ['tag.json', 'primers.csv', 'qc.json', 'virus_properties.json', 'genemap.gff']
    # 'pathogen.json' (singular) is the file read below for V3 datasets, so it
    # is the name that has to be cleaned up when falling back to V2
    v3 = ['pathogen.json', 'genome_annotation.gff3', 'README.md', 'CHANGELOG.md']

    v2_tag_file = os.path.join(dir, 'tag.json')
    v3_tag_file = os.path.join(dir, 'pathogen.json')

    # NextClade Dataset V2
    if os.path.exists(v2_tag_file) and int(last_known_ver) < 3:
        with open(v2_tag_file) as j:
            tag = json.load(j)['tag']
        residual = v3  # clean up residual V3 files not overwritten
    # NextClade Dataset V3+
    elif os.path.exists(v3_tag_file) and int(last_known_ver) >= 3:
        with open(v3_tag_file) as j:
            tag = json.load(j)['version']['tag']
        residual = v2  # clean up residual V2 files not overwritten
    else:
        # no file containing a tag could be found: incomplete or non-existent
        return None

    for file in residual:
        leftover = os.path.join(dir, file)
        if os.path.exists(leftover):
            os.remove(leftover)

    return tag

def nextclade_dataset_flags(dataset, output, tag="None", version='3'):
    """
    Generate the 'nextclade dataset get' command line, based on the existing Nextclade installation.

    Args:
        dataset: Dataset name, passed as '--name'.
        output: Directory the dataset is written to, passed as '--output-dir'.
        tag: Dataset tag, passed as '--tag'. Omitted when None or the string "None".
        version: Nextclade version string. For a major version above 2 the
            '--reference' flag is left out.

    Returns:
        The complete command as a single string, with every flag value
        shell-quoted so that it can be run safely with shell=True.

    Raises:
        ValueError: if the major part of 'version' is not an integer.
    """
    # Local import keeps this block self-contained
    import shlex

    params = {'name': dataset,
              'reference': 'MN908947',
              'tag': tag,
              'output-dir': output
              }
    if int(version.split(".")[0]) > 2:
        del params['reference']

    if params['tag'] is None or params['tag'] == "None":
        del params['tag']

    cmd = ["nextclade dataset get"]
    # Quote each value: the command is executed through the shell, and the
    # output directory in particular may contain spaces. Values made up of
    # shell-safe characters are emitted unchanged.
    cmd.extend(f"--{flag} {shlex.quote(str(value))}" for flag, value in params.items())

    return " ".join(cmd)

def update_nextclade_dataset(vers, skip):
"""
Ensure nextclade dataset is updated to the latest dataset, placed within scripts.
Reference accession will be set by params.accession (viral_reference_contig_name).
"""
frontend = check_frontend()
output_dir = os.path.join(os.path.dirname(sys.argv[0]), 'nextclade')
nextclade_version = subprocess.run(f"nextclade --version".split(),
nc_version = subprocess.run(f"nextclade --version".split(),
stdout=subprocess.PIPE).stdout.decode('utf-8').strip().lower()
if nextclade_version.startswith("nextclade"):
nextclade_version = nextclade_version.split()[1]
if nc_version.startswith("nextclade"):
nc_version = nc_version.split()[1]
current_tag = nextclade_tag(output_dir, nc_version) # assign current tag to existing version (last known version and thus dataset should likely match)

if skip or (vers is None):
return output_dir, nextclade_version
return output_dir, nc_version
if not os.path.exists(output_dir):
os.mkdir(output_dir)
with open(vers) as fh:
line = fh.readlines()
assert len(line) == 3 # should only be nextclade, nextclade-data, and recomb
assert len(line) == 3
# should only be nextclade, nextclade-data (i.e., tag), and recomb (version <3)
software_ver = str(line[0].split(":", 1)[1]).strip().lstrip('v')
requested_ver = str(line[1].split(":", 1)[1]).strip()
recomb = eval(str(line[2].split(":")[1]).strip())

# check current version of nextclade, if failed, we stick with the latest
# search conda for latest version, this will be the default
### check current version of nextclade, if failed, we stick with the latest
# search mamba/conda for latest version, this will be the default
# PackagesNotFoundError for invalid version
print("\n## Existing nextclade install:")
print("Nextclade: " + nextclade_version + "\n")
print("Nextclade: " + nc_version + "\n")
print("## Changing installed versions as needed:")
try:
if software_ver != "None": # specific version requested, check if available
try:
softrequest = subprocess.check_output(f"conda search -c bioconda -f nextclade", shell=True).split()[-3].strip().decode('utf-8')
# check if already installed
if softrequest == nextclade_version:
# pull specified version
softrequest = subprocess.check_output(f"{frontend} search -c bioconda -f nextclade={software_ver}", shell=True).split()[-3].strip().decode('utf-8')
# check if requested version is already installed
if softrequest == nc_version:
print(f"Nextclade {softrequest} already installed! Skipping update!")
else:
print(f"Changing Nextclade from {nextclade_version} to {softrequest}!")
subprocess.run(f"conda install -q -y -c bioconda nextclade={softrequest}", shell=True, check=True)
print(f"Changing Nextclade from {nc_version} to {softrequest}!")
subprocess.run(f"{frontend} install -q -y -c bioconda nextclade={softrequest}", shell=True, check=True)
except subprocess.CalledProcessError:
print("Cannot find version requested, will ensure latest version!")
softrequest = subprocess.check_output(f"conda search -c bioconda -f nextclade", shell=True).split()[-3].strip().decode('utf-8')
# check if already installed
if softrequest == nextclade_version:
softrequest = subprocess.check_output(f"{frontend} search -c bioconda -f nextclade", shell=True).split()[-3].strip().decode('utf-8')
# check if latest already installed
if softrequest == nc_version:
print(f"Nextclade {softrequest} already installed! Skipping update!")
else:
subprocess.run(f"conda install -q -y -c bioconda nextclade={softrequest}", shell=True, check=True)
subprocess.run(f"{frontend} install -q -y -c bioconda nextclade={softrequest}", shell=True, check=True)
else:
print(f"Installing latest version of Nextclade!")
softrequest = subprocess.check_output(f"conda search -c bioconda -f nextclade", shell=True).split()[-3].strip().decode('utf-8')
if softrequest == nextclade_version:
softrequest = subprocess.check_output(f"{frontend} search -c bioconda -f nextclade", shell=True).split()[-3].strip().decode('utf-8')
# check if latest is already installed
if softrequest == nc_version:
print(f"Nextclade {softrequest} already installed! Skipping update!")
else:
subprocess.run(f"conda install -q -y -c bioconda nextclade={softrequest}", shell=True, check=True)
print(f"Installing latest version of Nextclade!")
subprocess.run(f"{frontend} install -q -y -c bioconda nextclade={softrequest}", shell=True, check=True)
except subprocess.CalledProcessError:
print(f"Something went wrong updating Nextclade! Skipping update!")

updated_nc = subprocess.run(f"nextclade --version".split(),
stdout=subprocess.PIPE).stdout.decode('utf-8').strip().lower()
if updated_nc.startswith("nextclade"):
updated_nc = updated_nc.split()[1]

# check nextclade_ver, if None, assign today's date
### check nextclade_ver (i.e., tag date), if None, assign today's date
try:
if requested_ver != "None":
# assume yyyy-mm-dd (so only yyyy and mm expected to stay ccnsistent)
# assume yyyy-mm-dd (so only yyyy and mm expected to stay consistent)
submitted = requested_ver.split("-")
submitted_date = [s.strip() for s in submitted]
assert len(submitted_date) == 3
year = str(submitted_date[0])
month = str(submitted_date[1])
if (len(submitted_date[2].split(" ")) == 2) or (len(submitted_date[2].split("T")) == 2): # date and time part of provided tag
if submitted_date[2].count("T") == 1: # only applies if starting input was in quotations itself in the config file
if (len(submitted_date[2].split(" ")) == 2) or (len(submitted_date[2].split("T")) == 2):
# date and time part of provided tag
if submitted_date[2].count("T") == 1:
# only applies if starting input was in quotations itself in the config file
day = str(submitted_date[2]).split("T")[0].strip()
timestamp = str(submitted_date[2]).split("T")[1].split("+", 1)[0].split(":")
else:
Expand All @@ -123,88 +190,84 @@ def update_nextclade_dataset(vers, skip):
else: # only a date provided, assume timestamp
day = str(submitted_date[2])
tags = ["12", "00", "00"]
requested = str("%s-%s-%sT%s:%s:%sZ" %(year, month, day, tags[0], tags[1], tags[2]))

if int(updated_nc.split(".")[0]) > 2:
# NextClade V3+ conversion
requested = str("%s-%s-%s--%s-%s-%sZ" %(year, month, day, tags[0], tags[1], tags[2]))
else:
# NextClade V2 conversion
requested = str("%s-%s-%sT%s:%s:%sZ" %(year, month, day, tags[0], tags[1], tags[2]))
else:
requested = None
except (AssertionError, TypeError, ValueError): # some other input that isn't in yyyy-mm-dd date format
except (AssertionError, TypeError, ValueError):
# some other input that isn't in yyyy-mm-dd date format
print(f"\nProvided Nextclade dataset tag invalid! Downloading latest...")
requested = None

if recomb:
dataset = 'sars-cov-2'
else:
### recomb only relevent for NextClade V2
if not recomb and int(final_software_version) < 3: # nextclade <=v2
dataset = 'sars-cov-2-no-recomb'
else:
dataset = 'sars-cov-2'

# If specific tag requested, attempt to install, otherwise install latest
accession = 'MN908947'
current_tag = None
if os.path.exists(os.path.join(output_dir, 'tag.json')):
j = open(os.path.join(output_dir, 'tag.json'))
data = json.load(j)
current_tag = data['tag']
j.close()


# Generate the command for 'nextclade dataset get'
cmd = nextclade_dataset_flags(dataset, output_dir, requested, updated_nc)

if requested is not None:
# check existing database, if found
if requested == current_tag:
print(f"Nextclade dataset {requested} already installed! Skipping update!")
else:
try:
print(f"\nDownloading Nextclade {dataset} dataset tagged {requested} for reference {accession}!")
subprocess.run(f"nextclade dataset get "
f"--name '{dataset}' "
f"--reference '{accession}' "
f"--tag {requested} "
f"--output-dir '{output_dir}'", shell=True, check=True)
print(f"\nDownloading Nextclade dataset tagged {requested} for reference {dataset}!")
subprocess.run(cmd, shell=True, check=True)
except subprocess.CalledProcessError:
print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade {dataset} dataset for reference {accession}...")
print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade dataset for reference {dataset}...")
try:
subprocess.run(f"nextclade dataset get "
f"--name '{dataset}' "
f"--reference '{accession}' "
f"--output-dir '{output_dir}'", shell=True, check=True)
cmd = nextclade_dataset_flags(dataset, output_dir, version=updated_nc)
subprocess.run(cmd, shell=True, check=True)
except subprocess.CalledProcessError:
if current_tag is not None:
print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!")
print(f"Something went wrong updating the Nextclade dataset, using existing {current_tag} instead!")
requested = current_tag
else:
print(f"Something went wrong updating the Nextclade dataset! No database could be found which may result in errors! Skipping update...")
requested = "Unknown"
else:
try:
print(f"\nDownloading latest Nextclade {dataset} dataset for reference {accession}!")
subprocess.run(f"nextclade dataset get "
f"--name '{dataset}' "
f"--reference '{accession}' "
f"--output-dir '{output_dir}'", shell=True, check=True)
print(f"\nDownloading latest Nextclade dataset for reference {dataset}!")
subprocess.run(cmd, shell=True, check=True)
except subprocess.CalledProcessError:
if current_tag is not None:
print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!")
print(f"Something went wrong updating the Nextclade dataset, using existing {current_tag} instead!")
requested = current_tag
else:
print(f"Something went wrong updating the Nextclade dataset! No database could be found which may result in errors! Skipping update...")
requested = "Unknown"

# Obtain final version information for output
nextclade_version = subprocess.run(f"nextclade --version".split(), stdout=subprocess.PIPE).stdout.decode('utf-8').strip().lower()
if nextclade_version.startswith("nextclade"):
nextclade_version = nextclade_version.split()[1]
# Doubles as a cleanup step to avoid V2 and V3 tag mix-ups
if requested is None:
tag = nextclade_tag(output_dir, updated_nc)
today = datetime.today().strftime('%Y-%m-%d')
requested = f"Latest as of {today}"
requested = f"Latest as of {today}: {tag}"
with open('final_nextclade_versions.txt', 'w+') as out:
print("\n## Nextclade and datasets now:")
print("Nextclade: " + nextclade_version)
print("Reference: %s" %(accession))
print("Nextclade: " + updated_nc)
print("Reference: MN908947")
print("Dataset: %s" %(dataset))
print("Dataset version: %s" %(requested))
# Output to file
print("## Nextclade and datasets now:", file=out)
print("Nextclade: " + nextclade_version, file=out)
print("Reference: %s" %(accession), file=out)
print("Nextclade: " + updated_nc, file=out)
print("Reference: MN908947", file=out)
print("Dataset: %s" %(dataset), file=out)
print("Dataset version: %s" %(requested), file=out)

return output_dir, nextclade_version
return output_dir, updated_nc


def run_nextclade(input_genomes, dataset, threads, version):
Expand All @@ -216,7 +279,7 @@ def run_nextclade(input_genomes, dataset, threads, version):
# run altered commands for Nextclade v1 and v2+
if version.startswith("1"):
subprocess.check_output(f"nextclade -i {input_genomes} -j {threads} --input-dataset {dataset} -c {str(output_file)} --output-dir {output_dir}".split(), stderr=subprocess.DEVNULL)
elif version.startswith("2"):
elif version.startswith("2") or version.startswith("3"):
subprocess.check_output(f"nextclade run -j {threads} --input-dataset {dataset} -c {str(output_file)} {input_genomes}".split(), stderr=subprocess.DEVNULL)
else:
print("Unknown version of nextclade!")
Expand All @@ -226,8 +289,6 @@ def run_nextclade(input_genomes, dataset, threads, version):
nextclade_df = pd.read_csv(str(output_file), sep=";")

# get version information
#nextclade_version = subprocess.run(f"nextclade --version".split(), stdout=subprocess.PIPE)
#nextclade_version = nextclade_version.stdout.decode('utf-8').strip()
nextclade_df['nextclade_version'] = f"nextclade {version}"

# tidy up dataframe
Expand Down
7 changes: 5 additions & 2 deletions scripts/signal_postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,15 @@

assert long_git_id.startswith('$Id: ')
#short_git_id = long_git_id[5:12]
short_git_id = "v1.6.2"
short_git_id = "v1.6.3"

# Suppresses matplotlib warning (https://github.com/jaleezyy/covid-19-signal/issues/59)
# Creates a small memory leak, but it's nontrivial to fix, and won't be a practical concern!
plt.rcParams.update({'figure.max_open_warning': 0})
plt.style.use('seaborn-whitegrid')
try:
plt.style.use('seaborn-whitegrid')
except OSError: # styles need an update will finalize if matplotlib version hardcoded
plt.style.use('seaborn-v0_8-whitegrid')


######################## Helper functions/classes for text file parsing #######################
Expand Down
Loading

0 comments on commit 2106195

Please sign in to comment.