Skip to content

Commit

Permalink
Merge pull request #105 from veg/true-append
Browse files Browse the repository at this point in the history
true append initial commit
  • Loading branch information
stevenweaver authored Aug 12, 2024
2 parents b69b6f9 + 2b86719 commit 67e350d
Show file tree
Hide file tree
Showing 34 changed files with 425,474 additions and 13 deletions.
12,913 changes: 12,913 additions & 0 deletions examples/trueAppend/Network-Full.csv

Large diffs are not rendered by default.

25,824 changes: 25,824 additions & 0 deletions examples/trueAppend/Network-Full.fas

Large diffs are not rendered by default.

157,176 changes: 157,176 additions & 0 deletions examples/trueAppend/Network-Full.tn93.csv

Large diffs are not rendered by default.

801 changes: 801 additions & 0 deletions examples/trueAppend/Network-New-1.csv

Large diffs are not rendered by default.

1,600 changes: 1,600 additions & 0 deletions examples/trueAppend/Network-New-1.fas

Large diffs are not rendered by default.

2,471 changes: 2,471 additions & 0 deletions examples/trueAppend/Network-New-1.tn93.csv

Large diffs are not rendered by default.

1,596 changes: 1,596 additions & 0 deletions examples/trueAppend/Network-New-2.csv

Large diffs are not rendered by default.

3,190 changes: 3,190 additions & 0 deletions examples/trueAppend/Network-New-2.fas

Large diffs are not rendered by default.

10,039 changes: 10,039 additions & 0 deletions examples/trueAppend/Network-New-2.tn93.csv

Large diffs are not rendered by default.

2,391 changes: 2,391 additions & 0 deletions examples/trueAppend/Network-New-3.csv

Large diffs are not rendered by default.

4,780 changes: 4,780 additions & 0 deletions examples/trueAppend/Network-New-3.fas

Large diffs are not rendered by default.

23,551 changes: 23,551 additions & 0 deletions examples/trueAppend/Network-New-3.tn93.csv

Large diffs are not rendered by default.

3,186 changes: 3,186 additions & 0 deletions examples/trueAppend/Network-New-4.csv

Large diffs are not rendered by default.

6,370 changes: 6,370 additions & 0 deletions examples/trueAppend/Network-New-4.fas

Large diffs are not rendered by default.

43,009 changes: 43,009 additions & 0 deletions examples/trueAppend/Network-New-4.tn93.csv

Large diffs are not rendered by default.

401 changes: 401 additions & 0 deletions examples/trueAppend/Network-Old-1.csv

Large diffs are not rendered by default.

800 changes: 800 additions & 0 deletions examples/trueAppend/Network-Old-1.fas

Large diffs are not rendered by default.

3,404 changes: 3,404 additions & 0 deletions examples/trueAppend/Network-Old-1.tn93.csv

Large diffs are not rendered by default.

796 changes: 796 additions & 0 deletions examples/trueAppend/Network-Old-2.csv

Large diffs are not rendered by default.

1,590 changes: 1,590 additions & 0 deletions examples/trueAppend/Network-Old-2.fas

Large diffs are not rendered by default.

14,576 changes: 14,576 additions & 0 deletions examples/trueAppend/Network-Old-2.tn93.csv

Large diffs are not rendered by default.

1,191 changes: 1,191 additions & 0 deletions examples/trueAppend/Network-Old-3.csv

Large diffs are not rendered by default.

2,380 changes: 2,380 additions & 0 deletions examples/trueAppend/Network-Old-3.fas

Large diffs are not rendered by default.

30,129 changes: 30,129 additions & 0 deletions examples/trueAppend/Network-Old-3.tn93.csv

Large diffs are not rendered by default.

1,586 changes: 1,586 additions & 0 deletions examples/trueAppend/Network-Old-4.csv

Large diffs are not rendered by default.

3,170 changes: 3,170 additions & 0 deletions examples/trueAppend/Network-Old-4.fas

Large diffs are not rendered by default.

53,306 changes: 53,306 additions & 0 deletions examples/trueAppend/Network-Old-4.tn93.csv

Large diffs are not rendered by default.

12,913 changes: 12,913 additions & 0 deletions examples/trueAppend/QC-Large.csv

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions examples/trueAppend/QC-Small.csv

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions examples/trueAppend/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Synthetic datasets

Based off over 12,000 LANL pol sequences.

## Network data
`Network` denotes data that is used to either build or append to a network.

`New` or `Old` denotes thats that are either recent enough to trigger automated priority set creation or not, respectively.

Numbers `1`, `2`, `3` denote the order in which they should be used.

## Quality control

`QC` denotes data that is used for the homology screening.

## Both

`Small` datasets are useful for development and testing; should run in under a minute.

`Large` datasets will take longer but contain a more realistic amount of sequences.
75 changes: 62 additions & 13 deletions hivtrace/hivtrace.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import logging
import tempfile
import hivtrace.strip_drams as sd
import hivtrace.true_append as ta


class status:
Expand Down Expand Up @@ -51,6 +52,11 @@ def update_status(id, phase, status, msg=""):

logging.info(json.dumps(msg))

def fasta_to_dict(fasta_iterator):
fasta_dict = {}
for record in fasta_iterator:
fasta_dict[record.id] = str(record.seq)
return fasta_dict

def fasta_iter(fasta_name):
"""
Expand Down Expand Up @@ -280,7 +286,10 @@ def hivtrace(id,
handle_contaminants="remove",
skip_alignment=False,
save_intermediate=True,
prior=None
prior=None,
true_append=False,
prior_tn93=None,
prior_fasta=None
):
"""
PHASE 1) Pad sequence alignment to HXB2 length with bealign
Expand Down Expand Up @@ -511,25 +520,47 @@ def hivtrace(id,
# PHASE 4
update_status(id, phases.COMPUTE_TN93_DISTANCE, status.RUNNING)

with open(JSON_TN93_FN, 'w') as tn93_fh:
tn93_process = [
TN93DIST, '-q', '-0', '-o', OUTPUT_TN93_FN, '-t', threshold, '-a',
ambiguities, '-l', min_overlap, '-g', fraction
if ambiguities == 'resolve' else '1.0', '-f', OUTPUT_FORMAT,
OUTPUT_FASTA_FN
]
if true_append:
seqs_new = fasta_to_dict(SeqIO.parse(OUTPUT_FASTA_FN, format="fasta")) # Example function to parse new sequences
seqs_old = fasta_to_dict(SeqIO.parse(prior_fasta, format="fasta")) # Example function to parse old sequences

tn93_args = ['-q', '-0', '-t', threshold, '-a',
ambiguities, '-l', min_overlap, '-g', fraction
if ambiguities == 'resolve' else '1.0', '-f', OUTPUT_FORMAT
]

logging.debug(' '.join(tn93_process))
subprocess.check_call(tn93_process, stdout=tn93_fh, stderr=tn93_fh)
print(OUTPUT_TN93_FN)

ta.true_append(seqs_new=seqs_new,
seqs_old=seqs_old,
input_old_dists=prior_tn93,
output_dists=OUTPUT_TN93_FN,
tn93_args=tn93_args,
tn93_path=TN93DIST)

if OUTPUT_TN93_FN != DEST_TN93_FN:
shutil.copyfile(OUTPUT_TN93_FN, DEST_TN93_FN)

update_status(id, phases.COMPUTE_TN93_DISTANCE, status.COMPLETED)
else:
with open(JSON_TN93_FN, 'w') as tn93_fh:
tn93_process = [
TN93DIST, '-q', '-0', '-o', OUTPUT_TN93_FN, '-t', threshold, '-a',
ambiguities, '-l', min_overlap, '-g', fraction
if ambiguities == 'resolve' else '1.0', '-f', OUTPUT_FORMAT,
OUTPUT_FASTA_FN
]

logging.debug(' '.join(tn93_process))
subprocess.check_call(tn93_process, stdout=tn93_fh, stderr=tn93_fh)

if OUTPUT_TN93_FN != DEST_TN93_FN:
shutil.copyfile(OUTPUT_TN93_FN, DEST_TN93_FN)

update_status(id, phases.COMPUTE_TN93_DISTANCE, status.COMPLETED)

# raise an exception if tn93 file is empty
if is_tn93_file_empty(DEST_TN93_FN):
raise Exception(' '.join(tn93_process) + "returned empty file")
raise Exception("tn93 returned empty file")


# send contents of tn93 to status page
Expand Down Expand Up @@ -758,6 +789,9 @@ def main():
parser.add_argument('--log', help='Write logs to specified directory')
parser.add_argument('-o', '--output', help='Specify output filename')
parser.add_argument('-p', '--prior', help='Prior network configuration')
parser.add_argument('--true-append', required=False, action='store_true', help="Use a previous HIV-TRACE run to only compute tn93 on the differences")
parser.add_argument('-iD', '--input-old-dists', required=False, type=str, help="Input: Old TN93 distances (CSV)")
parser.add_argument('-iT', '--input-old-fasta', required=False, type=str, help="Input: Old Fasta (FASTA)")

args = parser.parse_args()

Expand All @@ -768,6 +802,10 @@ def main():

logging.basicConfig(filename=log_fn, level=logging.DEBUG)

if args.true_append:
if not args.input_old_dists or not args.input_old_fasta:
parser.error("--true-append requires --input_old_dists and --input_old_fasta to be set")

FN = args.input
OUTPUT_FN = args.input + '.results.json'

Expand All @@ -780,10 +818,18 @@ def main():
FRACTION = args.fraction
STRIP_DRAMS = args.strip_drams
PRIOR = None
PRIOR_TN93 = None
PRIOR_FASTA = None

if(args.prior):
PRIOR = args.prior

if(args.input_old_dists):
PRIOR_TN93 = args.input_old_dists

if(args.input_old_fasta):
PRIOR_FASTA = args.input_old_fasta

if args.output:
OUTPUT_FN = args.output

Expand All @@ -804,7 +850,10 @@ def main():
handle_contaminants=args.curate,
skip_alignment=args.skip_alignment,
save_intermediate=(not args.do_not_store_intermediate),
prior=PRIOR
prior=PRIOR,
true_append=args.true_append,
prior_tn93=PRIOR_TN93,
prior_fasta=PRIOR_FASTA
)

# Write to output filename if specified
Expand Down
1 change: 1 addition & 0 deletions hivtrace/hivtraceviz.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#! /usr/bin/env python3
"""
This module contains the command line interface for hivtrace.
"""
Expand Down
2 changes: 2 additions & 0 deletions hivtrace/strip_drams.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#! /usr/bin/env python3

# This PYTHON script will read in an aligned Fasta file (HIV prot/rt sequences) and remove
# DRAM (drug resistance associated mutation) codon sites. It will output a new alignment
# with these sites removed. It requires input/output file names along with the list of
Expand Down
211 changes: 211 additions & 0 deletions hivtrace/true_append.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
#! /usr/bin/env python3
'''
True Append for HIV-TRACE
'''

# imports
from contextlib import contextmanager
from csv import reader
from datetime import datetime
from gzip import open as gopen
from os import mkfifo, rmdir, unlink
from os.path import isfile
from subprocess import run
from sys import argv, stderr, stdin, stdout
from tempfile import mkdtemp
from threading import Thread
import argparse

# constants
HIVTRACE_TRUE_APPEND_VERSION = '0.0.2'
DEFAULT_TN93_ARGS = ''
DEFAULT_TN93_PATH = 'tn93'
MIN_TN93_VERSION = '1.0.14'
STDIO = {'stderr':stderr, 'stdin':stdin, 'stdout':stdout}

# check TN93 version
def check_tn93_version(tn93_path):
A, B, C = [int(v) for v in MIN_TN93_VERSION.split('.')]
user_version = run([tn93_path, '--version'], capture_output=True).stderr.decode().strip().lstrip('v')
a, b, c = [int(v) for v in user_version.split('.')]
if (a < A) or (a == A and b < B) or (a == A and b == B and c < C):
raise ValueError("Installed tn93 version (%s) is below the minimum required version (%s)" % (user_version, MIN_TN93_VERSION))

# return the current time as a string
def get_time():
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# print to log (prefixed by current time)
def print_log(s='', end='\n'):
print("[%s] %s" % (get_time(), s), file=stderr, end=end); stderr.flush()

# open file and return file object
def open_file(fn, mode='r', text=True):
if fn in STDIO:
return STDIO[fn]
elif fn.lower().endswith('.gz'):
if mode == 'a':
raise NotImplementedError("Cannot append to gzip file")
if text:
mode += 't'
return gopen(fn, mode)
else:
return open(fn, mode)

# create and yield a temporary FIFO as a context manager: https://stackoverflow.com/a/54895027/2134991
@contextmanager
def create_fifo():
tmp_d = mkdtemp(); tmp_fn = '%s/fifo' % tmp_d; mkfifo(tmp_fn)
try:
yield tmp_fn
finally:
unlink(tmp_fn); rmdir(tmp_d)

# parse user args
def parse_args():
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-it', '--input_user_table', required=True, type=str, help="Input: User table (CSV)")
parser.add_argument('-iT', '--input_old_table', required=True, type=str, help="Input: Old table (CSV)")
parser.add_argument('-iD', '--input_old_dists', required=True, type=str, help="Input: Old TN93 distances (CSV)")
parser.add_argument('-oD', '--output_dists', required=False, type=str, default='stdout', help="Output: Updated TN93 distances (CSV)")
parser.add_argument('--tn93_args', required=False, type=str, default=DEFAULT_TN93_ARGS, help="Optional tn93 arguments")
parser.add_argument('--tn93_path', required=False, type=str, default=DEFAULT_TN93_PATH, help="Path to tn93 executable")
args = parser.parse_args()
for fn in [args.input_user_table, args.input_old_table, args.input_old_dists]:
if not isfile(fn) and not fn.startswith('/dev/fd'):
raise ValueError("File not found: %s" % fn)
for fn in [args.output_dists]:
if fn.lower().endswith('.gz'):
raise ValueError("Cannot directly write to gzip output file. To gzip the output, specify 'stdout' as the output file, and then pipe to gzip.")
if isfile(fn):
raise ValueError("File exists: %s" % fn)
return args

# parse input table
# Argument: `input_table` = path to input table CSV
# Return: `dict` in which keys are EHARS UIDs and values are clean_seqs
def parse_table(input_table_fn):
# set things up
header_row = None; col2ind = None; seqs = dict()
infile = open_file(input_table_fn)

# load sequences from table (and potentially write output FASTA)
for row in reader(infile):
# parse header row
if header_row is None:
header_row = row; col2ind = {k:i for i,k in enumerate(header_row)}
for k in ['ehars_uid', 'clean_seq']:
if k not in col2ind:
raise ValueError("Column '%s' missing from input user table: %s" % (k, input_user_table))

# parse sequence row
else:
ehars_uid = row[col2ind['ehars_uid']].strip(); clean_seq = row[col2ind['clean_seq']].strip().upper()
if ehars_uid in seqs:
raise ValueError("Duplicate EHARS UID (%s) in file: %s" % (ehars_uid, input_table))
seqs[ehars_uid] = clean_seq

# clean up and return
infile.close()
return seqs

# determine dataset deltas
# Argument: `seqs_new` = `dict` where keys are user-uploaded sequence IDs and values are sequences
# Argument: `seqs_old` = `dict` where keys are existing sequence IDs and values are sequences
# Return: `to_add` = `set` containing IDs in `seqs_new` that need to be added to `seqs_old`
# Return: `to_replace` = `set` containing IDs in `seqs_old` whose sequences need to be updated with those in `seqs_new`
# Return: `to_delete` = `set` containing IDs in `seqs_old` that need to be deleted
# Return: `to_keep` = `set` containing IDs in `seqs_old` that need to be kept as-is
def determine_deltas(seqs_new, seqs_old):
to_add = set(); to_replace = set(); to_delete = set(seqs_old.keys()); to_keep = set()
for ID in seqs_new:
if ID in seqs_old:
to_delete.remove(ID)
if seqs_new[ID] == seqs_old[ID]:
to_keep.add(ID)
else:
to_replace.add(ID)
else:
to_add.add(ID)
return to_add, to_replace, to_delete, to_keep

# remove IDs from TN93 distance CSV
# Argument: `in_dists_fn` = path to input old TN93 distances file
# Argument: `out_dists_file` = write/append-mode file object to output TN93 distances file
# Argument: `to_keep` = `set` containing IDs to keep in TN93 distances file
# Argument: `remove_header` = `True` to remove the header in the output file, otherwise `False`
def remove_IDs_tn93(in_dists_fn, out_dists_file, to_keep, remove_header=True):
infile = open_file(in_dists_fn)
for row_num, line in enumerate(infile):
row = [v.strip() for v in line.split(',')]
if (row_num == 0 and not remove_header) or (row_num != 0 and row[0] in to_keep and row[1] in to_keep):
out_dists_file.write(line)
infile.close()

# run tn93 on all new-new and new-old pairs
# Argument: `seqs_new` = `dict` where keys are user-uploaded sequence IDs and values are sequences
# Argument: `seqs_old` = `dict` where keys are existing sequence IDs and values are sequences
# Argument: `out_dists_file` = write/append-mode file object to output TN93 distances file
# Argument: `to_add` = `set` containing IDs to add to TN93 distances file
# Argument: `to_replace` = `set` containing IDs in `seqs_old` whose sequences need to be updated with those in `seqs_new`
# Argument: `to_keep` = `set` containing IDs to keep in TN93 distances file
# Argument: `tn93_args` = string containing optional tn93 arguments
# Argument: `tn93_path` = path to tn93 executable
def run_tn93(seqs_new, seqs_old, out_dists_file, to_add, to_replace, to_keep, remove_header=True, tn93_args=DEFAULT_TN93_ARGS, tn93_path=DEFAULT_TN93_PATH):
# set things up
tn93_base_command = [tn93_path] + [v.strip() for v in tn93_args]
new_fasta_data = ''.join('>%s\n%s\n' % (k,seqs_new[k]) for k in (to_add | to_replace)).encode('utf-8')

# calculate new-new distances
if len(new_fasta_data) != 0:
tn93_command_new_new = list(tn93_base_command)
if remove_header:
tn93_command_new_new.append('-n')
run(tn93_command_new_new, input=new_fasta_data)

# calculate new-old distances
if len(new_fasta_data) != 0 and len(to_keep) != 0:
old_fasta_data = ''.join('>%s\n%s\n' % (k,seqs_old[k]) for k in to_keep).encode('utf-8')
with create_fifo() as tmp_fifo_fn:
def write_to_fifo(data):
with open(tmp_fifo_fn, 'wb') as tmp_fifo:
tmp_fifo.write(data)
write_thread = Thread(target=write_to_fifo, args=(new_fasta_data,))
write_thread.start()
tn93_command_new_old = tn93_base_command + ['-n', '-s', tmp_fifo_fn]
run(tn93_command_new_old, input=old_fasta_data, stdout=out_dists_file)
write_thread.join()

# main True Append program
def true_append(seqs_new=None, seqs_old=None, input_old_dists=None, output_dists=None, tn93_args=DEFAULT_TN93_ARGS, tn93_path=DEFAULT_TN93_PATH):
print_log("Running HIV-TRACE True Append v%s" % HIVTRACE_TRUE_APPEND_VERSION)
if seqs_new is None: # args not provided, so parse from command line
args = parse_args()
print_log("Command: %s" % ' '.join(argv))
print_log("Parsing user table: %s" % args.input_user_table)
seqs_new = parse_table(args.input_user_table)
print_log("Parsing old table: %s" % args.input_old_table)
seqs_old = parse_table(args.input_old_table)
output_dists = args.output_dists
input_old_dists = args.input_old_dists
tn93_args = args.tn93_args
tn93_path = args.tn93_path
check_tn93_version(tn93_path)
print_log("- Num New Sequences: %s" % len(seqs_new))
print_log("- Num Old Sequences: %s" % (len(seqs_old)))
print_log("Determining deltas between new seqs and old seqs ...")
to_add, to_replace, to_delete, to_keep = determine_deltas(seqs_new, seqs_old)
print_log("- Add: %s" % len(to_add))
print_log("- Replace: %s" % len(to_replace))
print_log("- Delete: %s" % len(to_delete))
print_log("- Do nothing: %s" % (len(to_keep)))
print_log("Creating output TN93 distances CSV: %s" % output_dists)
output_dists_file = open_file(output_dists, 'w')
print_log("Copying old TN93 distances from: %s" % input_old_dists)
remove_IDs_tn93(input_old_dists, output_dists_file, to_keep, remove_header=False)
print_log("Calculating all new pairwise TN93 distances...")
run_tn93(seqs_new, seqs_old, output_dists_file, to_add, to_replace, to_keep, remove_header=True, tn93_args=tn93_args, tn93_path=tn93_path)

# run main program
if __name__ == "__main__":
true_append() # running with default args will use argparse

0 comments on commit 67e350d

Please sign in to comment.