From 6f290a02b71ef60a3aee720f0ff2a9d89f3eb270 Mon Sep 17 00:00:00 2001 From: James Ferguson Date: Sat, 15 Sep 2018 14:14:59 +1000 Subject: [PATCH 01/11] Cite update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2d2d5ca..ab7ca1c 100644 --- a/README.md +++ b/README.md @@ -361,7 +361,7 @@ I would like to thank the rest of my lab in Genomic Technologies team from the [ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1413903.svg)](https://doi.org/10.5281/zenodo.1413903) -James M. Ferguson. Psy-Fer/fast5_fetcher: Initial release of fast5_fetcher. (2018). doi:10.5281/zenodo.1413903 +James M. Ferguson, & Martin A. Smith. (2018, September 12). Psy-Fer/fast5_fetcher: Initial release of fast5_fetcher (Version v1.0). Zenodo. http://doi.org/10.5281/zenodo.1413903 ## License From 598ec4f70d9e96db86d4854c4afa88d3f589fc8f Mon Sep 17 00:00:00 2001 From: James Ferguson Date: Sun, 21 Oct 2018 22:51:27 +1100 Subject: [PATCH 02/11] Small refactor, cut out ~70 lines of code, batch_tater updates --- README.md | 6 +- batch_tater.py | 28 +++-- fast5_fetcher.py | 306 ++++++++++++++++++----------------------------- 3 files changed, 138 insertions(+), 202 deletions(-) diff --git a/README.md b/README.md index ab7ca1c..67be3c8 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Reducing the number of fast5 files per folder in a single experiment was a welcomed addition to MinKnow. However this also made it rather useful for manual basecalling on a cluster, using array jobs, where each folder is basecalled individually, producing its own `sequencing_summary.txt`, `reads.fastq`, and reads folder containing the newly basecalled fast5s. Taring those fast5 files up into a single file was needed to keep the sys admins at bay, complaining about our millions of individual files on their drives. This meant, whenever there was a need to use the fast5 files from an experiment, or many experiments, unpacking the fast5 files was a significant hurdle both in time and disk space. -**fast5_fetcher** was built to address this bottleneck. By building an index file of the tarballs, and using the `sequencing_summary.txt` file to match readIDs with fast5 filenames, only the fast5 files you need can be extracted, either temporarily in a pipeline, or permanently, reducing space and simplifying downstream work flows. +**fast5_fetcher** was built to address this bottleneck. By building an index file of the tarballs, and using the `sequencing_summary.txt` file to match readIDs with fast5 filenames, only the fast5 files you need can be extracted, either temporarily in a pipeline, or permanently, reducing space and simplifying downstream work flows. # Requirements @@ -312,7 +312,7 @@ Potato scripting engaged This is designed to run on the output files from `fast5_fetcher.py` using option `-z`. This writes out file lists for each tarball that contains reads you want to process. Then `batch_tater.py` can read those files, to open the individual tar files, and extract the files, meaning the file is only opened once. -A recent test using the -z option on ~1.4Tb of data, across ~16/20 million files took about 10min (1CPU) to write and organise the file lists with fast5_fetch.py, and about 2min per array job to extract and repackage with batch_tater.py. 
+A recent test using the -z option on ~2.2Tb of data, across ~11/27 million files took about 10min (1CPU) to write and organise the file lists with fast5_fetch.py, and about 20s per array job to extract and repackage with batch_tater.py. This is best used when you want to do something all at once and filter your reads. Other approaches may be better when you are demultiplexing. @@ -331,7 +331,7 @@ BLAH=fast5/${FILE} mkdir ${TMPDIR}/fast5 -time python batch_tater.py name.index.gz ${BLAH} ${TMPDIR}/fast5/ +time python batch_tater.py tater_master.txt ${BLAH} ${TMPDIR}/fast5/ echo "size of files:" >&2 du -shc ${TMPDIR}/fast5/ >&2 diff --git a/batch_tater.py b/batch_tater.py index 0256a5a..48237f6 100644 --- a/batch_tater.py +++ b/batch_tater.py @@ -24,7 +24,7 @@ mkdir ${TMPDIR}/fast5 - time python batch_tater.py tar_index.txt ${BLAH} ${TMPDIR}/fast5/ + time python batch_tater.py tater_master.txt ${BLAH} ${TMPDIR}/fast5/ echo "size of files:" >&2 du -shc ${TMPDIR}/fast5/ >&2 @@ -44,30 +44,40 @@ Launch: echo $CMD && $CMD + + + stats: + + fastq: 27491304 + mapped: 11740093 + z mode time: 10min + batch_tater total time: 21min + per job time: ~28s + number of CPUs: 100 ''' # being lazy and using sys.argv...i mean, it is pretty lit -index = sys.argv[1] +master = sys.argv[1] tar_list = sys.argv[2] save_path = sys.argv[3] # this will probs need to be changed based on naming convention # I think i was a little tired when I wrote this -tar_name = '.'.join(tar_list.split('/')[-1].split('.')[:3]) +# tar_name = '.'.join(tar_list.split('/')[-1].split('.')[:3]) PATH = 0 -# for stats later and easy job relauncing -print >> sys.stderr, "extracting:", tar_name - # not elegent, but gets it done -with open(index, 'r') as f: +with open(master, 'r') as f: for l in f: l = l.strip('\n') - if tar_name in l.split('/'): - PATH = l + l = l.split('\t') + if l[0] == tar_list: + PATH = l[1] break +# for stats later and easy job relauncing +print >> sys.stderr, "extracting:", tar_list # do the thing. That --transform hack is awesome. Blows away all the leading folders. if PATH: cmd = "tar -xf {} --transform='s/.*\///' -C {} -T {}".format( diff --git a/fast5_fetcher.py b/fast5_fetcher.py index e810153..5596c4f 100644 --- a/fast5_fetcher.py +++ b/fast5_fetcher.py @@ -13,7 +13,7 @@ Garvan Institute Copyright 2017 - Fast5 Fetcher is designed to help manage fast5 file data storage and organisation. + fast5_fetcher is designed to help manage fast5 file data storage and organisation. It takes 3 files as input: fastq/paf/flat, sequencing_summary, index -------------------------------------------------------------------------------------- @@ -25,7 +25,8 @@ version 0.6 - did a dumb. 
changed x in s to set/dic entries O(n) vs O(1) version 0.7 - cleaned up a bit to share and removed some hot and steamy features version 0.8 - Added functionality for un-tarred file structures and seq_sum only - version 1.0 - First release + version 1.0 - First release + version 1.1 - refactor with dicswitch and batch_tater updates TODO: @@ -33,6 +34,8 @@ - autodetect file structures - autobuild index file - make it a sub script as well - Consider using csv.DictReader() instead of wheel building + - flesh out batch_tater and give better examples and clearer how-to + - options to build new index/SS of fetched fast5s ----------------------------------------------------------------------------- MIT License @@ -113,27 +116,6 @@ def main(): ids = get_flat_reads(args.flat) filenames = get_filenames(args.seq_sum, ids) - ''' - if args.fast5: - paths = get_paths(args.index, filenames, args.fast5) - # place multiprocessing pool here - # print >> sys.stderr, "All the paths: \n", paths - print >> sys.stderr, "extracting..." - for p, f in paths: - if args.pppp: - if p in p_dic: - p_dic[p].append(f) - else: - p_dic[p] = [f] - continue - try: - #print >> sys.stderr, "extracting:", p, f - extract_file(p, f, args.output) - except: - traceback.print_exc() - print >> sys.stderr, "Failed to extract:", p, f - else: - ''' paths = get_paths(args.index, filenames) print >> sys.stderr, "extracting..." # place multiprocessing pool here @@ -144,24 +126,40 @@ def main(): else: p_dic[p] = [f] continue - try: - extract_file(p, f, args.output) - except: - traceback.print_exc() - print >> sys.stderr, "Failed to extract:", p, f + else: + try: + extract_file(p, f, args.output) + except: + traceback.print_exc() + print >> sys.stderr, "Failed to extract:", p, f # For each .tar file, write a file with the tarball name as filename.tar.txt # and contains a list of files to extract - input for batch_tater.py if args.pppp: - for i in p_dic: - fname = args.output + i.split('/')[-1] + ".txt" - with open(fname, 'w') as f: - for j in p_dic[i]: - f.write(j) - f.write('\n') + with open("tater_master.txt", 'w') as m: + for i in p_dic: + fname = "tater_" + i.split('/')[-1] + ".txt" + m_entry = "{}\t{}".format(fname, i) + m.write(m_entry) + m.write('\n') + with open(fname, 'w') as f: + for j in p_dic[i]: + f.write(j) + f.write('\n') print >> sys.stderr, "done!" 
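# A minimal standalone sketch of the -z output written above, with
# hypothetical tarball paths: tater_master.txt maps each per-tarball
# list file to the tarball it indexes (tab separated), which is exactly
# what batch_tater.py reads back.
p_dic_example = {
    "/data/run1/fast5/batch_0.tar": ["fast5/a.fast5", "fast5/b.fast5"],
    "/data/run1/fast5/batch_1.tar": ["fast5/c.fast5"],
}
with open("tater_master.txt", "w") as m:
    for tar_path in p_dic_example:
        fname = "tater_" + tar_path.split("/")[-1] + ".txt"  # e.g. tater_batch_0.tar.txt
        m.write("{}\t{}\n".format(fname, tar_path))          # master line: list file -> tarball
        with open(fname, "w") as f:
            for member in p_dic_example[tar_path]:           # fast5s to pull from that tarball
                f.write(member + "\n")
# Each tater_*.txt then becomes one array-job task for batch_tater.py,
# so every tarball is only opened once.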
+def dicSwitch(i): + ''' + A switch to handle file opening and reduce duplicated code + ''' + open_method = { + "gz": gzip.open, + "norm": open + } + return open_method[i] + + def get_fq_reads(fastq): ''' read fastq file and extract read ids @@ -170,25 +168,20 @@ def get_fq_reads(fastq): c = 0 read_ids = set() if fastq.endswith('.gz'): - with gzip.open(fastq, 'rb') as gz: - for line in io.BufferedReader(gz): - c += 1 - line = line.strip('\n') - if c == 1: - idx = line.split()[0][1:] - read_ids.add(idx) - elif c >= 4: - c = 0 + f_read = dicSwitch('gz') else: - with open(fastq, 'rb') as fq: - for line in fq: - c += 1 - line = line.strip('\n') - if c == 1: - idx = line.split()[0][1:] - read_ids.add(idx) - elif c >= 4: - c = 0 + f_read = dicSwitch('norm') + with f_read(fastq, 'rb') as fq: + if fastq.endswith('.gz'): + fq = io.BufferedReader(fq) + for line in fq: + c += 1 + line = line.strip('\n') + if c == 1: + idx = line.split()[0][1:] + read_ids.add(idx) + elif c >= 4: + c = 0 return read_ids @@ -197,7 +190,13 @@ def get_paf_reads(reads): Parse paf file to pull read ids (from minimap2 alignment) ''' read_ids = set() - with open(reads, 'rb') as fq: + if reads.endswith('.gz'): + f_read = dicSwitch('gz') + else: + f_read = dicSwitch('norm') + with f_read(reads, 'rb') as fq: + if reads.endswith('.gz'): + fq = io.BufferedReader(fq) for line in fq: line = line.strip('\n') line = line.split() @@ -213,29 +212,22 @@ def get_flat_reads(filename): read_ids = set() check = True if filename.endswith('.gz'): - with gzip.open(filename, 'rb') as gz: - for line in io.BufferedReader(gz): - line = line.strip('\n') - if check: - if line[0] == '@': - x = 1 - else: - x = 0 - check = False - idx = line[x:] - read_ids.add(idx) + f_read = dicSwitch('gz') else: - with open(filename, 'rb') as fq: - for line in fq: - line = line.strip('\n') - if check: - if line[0] == '@': - x = 1 - else: - x = 0 - check = False - idx = line[x:] - read_ids.add(idx) + f_read = dicSwitch('norm') + with f_read(filename, 'rb') as fq: + if filename.endswith('.gz'): + fq = io.BufferedReader(fq) + for line in fq: + line = line.strip('\n') + if check: + if line[0] == '@': + x = 1 + else: + x = 0 + check = False + idx = line[x:] + read_ids.add(idx) return read_ids @@ -251,31 +243,23 @@ def get_filenames(seq_sum, ids): head = True files = set() if seq_sum.endswith('.gz'): - with gzip.open(seq_sum, 'rb') as sz: - for line in io.BufferedReader(sz): - if head: - head = False - continue - line = line.strip('\n') - line = line.split() - if ss_only: - files.add(line[0]) - else: - if line[1] in ids: - files.add(line[0]) + f_read = dicSwitch('gz') else: - with open(seq_sum, 'rb') as ss: - for line in ss: - if head: - head = False - continue - line = line.strip('\n') - line = line.split() - if ss_only: + f_read = dicSwitch('norm') + with f_read(seq_sum, 'rb') as sz: + if seq_sum.endswith('.gz'): + sz = io.BufferedReader(sz) + for line in sz: + if head: + head = False + continue + line = line.strip('\n') + line = line.split() + if ss_only: + files.add(line[0]) + else: + if line[1] in ids: files.add(line[0]) - else: - if line[1] in ids: - files.add(line[0]) return files @@ -287,102 +271,44 @@ def get_paths(index_file, filenames, f5=None): paths = [] c = 0 if index_file.endswith('.gz'): - with gzip.open(index_file, 'rb') as idz: - for line in io.BufferedReader(idz): - line = line.strip('\n') - c += 1 - if c > 10: - break - if line.endswith('.tar'): - tar = True - break + f_read = dicSwitch('gz') else: - with open(index_file, 'rb') as idx: - for line in idx: 
- line = line.strip('\n') - c += 1 - if c > 10: - break - if line.endswith('.tar'): - tar = True - break - ''' - if f5: - locker = False + f_read = dicSwitch('norm') + # detect normal or tars + with f_read(index_file, 'rb') as idz: if index_file.endswith('.gz'): - with gzip.open(index_file, 'rb') as idz: - for line in io.BufferedReader(idz): - line = line.strip('\n') - if line.endswith('.tar'): - if line.split('/')[-1] == f5.split('/')[-1]: - path = line - locker = False - else: - locker = True - elif line.endswith('.fast5') and not locker: - f = line.split('/')[-1] - if f in filenames: - paths.append([path, line]) - else: - continue - else: - with open(index_file, 'rb') as idx: - for line in idx: - line = line.strip('\n') - if line.endswith('.tar'): - if line.split('/')[-1] == f5.split('/')[-1]: - path = line - locker = False - else: - locker = True - elif line.endswith('.fast5') and not locker: - f = line.split('/')[-1] - if f in filenames: - paths.append([path, line]) - else: - continue - else: - ''' - if index_file.endswith('.gz'): - with gzip.open(index_file, 'rb') as idz: - for line in io.BufferedReader(idz): - line = line.strip('\n') - if tar: - if line.endswith('.tar'): - path = line - elif line.endswith('.fast5'): - f = line.split('/')[-1] - if f in filenames: - paths.append([path, line]) - else: - continue + idz = io.BufferedReader(idz) + for line in idz: + line = line.strip('\n') + c += 1 + if c > 10: + break + if line.endswith('.tar'): + tar = True + break + # extract paths + with f_read(index_file, 'rb') as idz: + if index_file.endswith('.gz'): + idz = io.BufferedReader(idz) + for line in idz: + line = line.strip('\n') + if tar: + if line.endswith('.tar'): + path = line + elif line.endswith('.fast5'): + f = line.split('/')[-1] + if f in filenames: + paths.append([path, line]) else: - if line.endswith('.fast5'): - f = line.split('/')[-1] - if f in filenames: - paths.append(['', line]) - else: - continue - else: - with open(index_file, 'rb') as idx: - for line in idx: - line = line.strip('\n') - if tar: - if line.endswith('.tar'): - path = line - elif line.endswith('.fast5'): - f = line.split('/')[-1] - if f in filenames: - paths.append([path, line]) - else: - continue + continue + else: + if line.endswith('.fast5'): + f = line.split('/')[-1] + if f in filenames: + paths.append(['', line]) else: - if line.endswith('.fast5'): - f = line.split('/')[-1] - if f in filenames: - paths.append(['', line]) - else: - continue + continue + return paths From 99bc6a590772f2abed14b91fc8a5c2d0432c41ca Mon Sep 17 00:00:00 2001 From: James Ferguson Date: Sun, 21 Oct 2018 22:53:31 +1100 Subject: [PATCH 03/11] acknowledgement update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 67be3c8..b0e3e6e 100644 --- a/README.md +++ b/README.md @@ -355,7 +355,7 @@ echo $CMD && $CMD ## Acknowledgements -I would like to thank the rest of my lab in Genomic Technologies team from the [Garvan Institute](https://www.garvan.org.au/) for their feedback on the development of this tool. +I would like to thank the rest of my lab (Shaun Carswell, Kirston Barton) in Genomic Technologies team from the [Garvan Institute](https://www.garvan.org.au/) for their feedback on the development of this tool. 
## Cite From 8bb3863ca7c2ad0e3dde4d7aa26225b9b80047e6 Mon Sep 17 00:00:00 2001 From: James Ferguson Date: Tue, 30 Oct 2018 20:34:04 +1100 Subject: [PATCH 04/11] fixed tar/gtar problem with MacOS, added OS detection --- README.md | 19 +++++++++++++----- fast5_fetcher.py | 52 +++++++++++++++++++++++++++++++----------------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index b0e3e6e..1935a9e 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ **fast5_fetcher** is a tool for fetching nanopore fast5 files to save time and simplify downstream analysis. - [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1413903.svg)](https://doi.org/10.5281/zenodo.1413903) ## Contents @@ -21,9 +20,11 @@ Reducing the number of fast5 files per folder in a single experiment was a welco Following a self imposed guideline, most things written to handle nanopore data or bioinformatics in general, will use as little 3rd party libraries as possible, aiming for only core libraries, or have all included files in the package. -In the case of `fast5_fetcher.py` and `batch_tater.py`, only core python libraries are used. So as long as **Python 2.7+** is present, everything should work with no extra steps. (Python 3 compatibility is coming in the next update) +In the case of `fast5_fetcher.py` and `batch_tater.py`, only core python libraries are used. So as long as **Python 2.7+** is present, everything should work with no extra steps. (Python 3 compatibility is coming in the next big update) + +##### Operating system: -There is one catch. Everything is written primarily for use with **Linux**. Due to **MacOS** running on Unix, so long as the GNU tools are installed, there should be minimal issues running it. **Windows 10** however may require more massaging to work with the new Linux integration. +There is one catch. Everything is written primarily for use with **Linux**. Due to **MacOS** running on Unix, so long as the GNU tools are installed (see below), there should be minimal issues running it. **Windows 10** however may require more massaging to work with the new Linux integration. # Getting Started @@ -187,6 +188,14 @@ Download the repository: git clone https://github.com/Psy-Fer/fast5_fetcher.git +If using MacOS, and NOT using homebrew, install it here: + + https://brew.sh/ + +then install gnu-tar with: + + brew install gnu-tar + ### Quick start Basic use on a local computer @@ -355,13 +364,13 @@ echo $CMD && $CMD ## Acknowledgements -I would like to thank the rest of my lab (Shaun Carswell, Kirston Barton) in Genomic Technologies team from the [Garvan Institute](https://www.garvan.org.au/) for their feedback on the development of this tool. +I would like to thank the rest of my lab (Shaun Carswell, Kirston Barton, Kai Martin) in Genomic Technologies team from the [Garvan Institute](https://www.garvan.org.au/) for their feedback on the development of this tool. ## Cite [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1413903.svg)](https://doi.org/10.5281/zenodo.1413903) -James M. Ferguson, & Martin A. Smith. (2018, September 12). Psy-Fer/fast5_fetcher: Initial release of fast5_fetcher (Version v1.0). Zenodo. http://doi.org/10.5281/zenodo.1413903 +James M. Ferguson, & Martin A. Smith. (2018, September 12). Psy-Fer/fast5_fetcher: Initial release of fast5_fetcher (Version v1.0). Zenodo. 
http://doi.org/10.5281/zenodo.1413903
## License diff --git a/fast5_fetcher.py b/fast5_fetcher.py index 5596c4f..b655211 100644 --- a/fast5_fetcher.py +++ b/fast5_fetcher.py @@ -5,6 +5,7 @@ import subprocess import traceback import argparse +import platform from functools import partial ''' @@ -17,17 +18,17 @@ It takes 3 files as input: fastq/paf/flat, sequencing_summary, index -------------------------------------------------------------------------------------- - version 0.0 - initial - version 0.2 - added argparser and buffered gz streams - version 0.3 - added paf input - version 0.4 - added read id flat file input - version 0.5 - pppp print output instead of extracting - version 0.6 - did a dumb. changed x in s to set/dic entries O(n) vs O(1) - version 0.7 - cleaned up a bit to share and removed some hot and steamy features - version 0.8 - Added functionality for un-tarred file structures and seq_sum only - version 1.0 - First release - version 1.1 - refactor with dicswitch and batch_tater updates - + version 0.0 - initial + version 0.2 - added argparser and buffered gz streams + version 0.3 - added paf input + version 0.4 - added read id flat file input + version 0.5 - pppp print output instead of extracting + version 0.6 - did a dumb. changed x in s to set/dic entries O(n) vs O(1) + version 0.7 - cleaned up a bit to share and removed some hot and steamy features + version 0.8 - Added functionality for un-tarred file structures and seq_sum only + version 1.0 - First release + version 1.1 - refactor with dicswitch and batch_tater updates + version 1.1.1 - Bug fix on --transform method, added OS detection TODO: - Python 3 compatibility @@ -82,8 +83,8 @@ def main(): help="paf alignment file for read ids") group.add_argument("-f", "--flat", help="flat file of read ids") - # parser.add_argument("-b", "--fast5", - # help="fast5.tar path to extract from - individual") + parser.add_argument("--OSystem", default=platform.system(), + help="running operating system - leave default unless doing odd stuff") parser.add_argument("-s", "--seq_sum", help="sequencing_summary.txt.gz file") parser.add_argument("-i", "--index", @@ -128,7 +129,7 @@ def main(): continue else: try: - extract_file(p, f, args.output) + extract_file(args, p, f) except: traceback.print_exc() print >> sys.stderr, "Failed to extract:", p, f @@ -312,17 +313,32 @@ def get_paths(index_file, filenames, f5=None): return paths -def extract_file(path, filename, save_path): +def extract_file(args, path, filename): ''' Do the extraction. I was using the tarfile python lib, but honestly, it sucks and was too volatile. if you have a better suggestion, let me know :) - That --transform hack is awesome btw. Blows away all the leading folders. use it + That --transform hack is awesome btw. Blows away all the leading folders. use cp for when using untarred structures. Not recommended, but here for completeness. + + --transform not working on MacOS. Need to use gtar + Thanks to Kai Martin for picking that one up! 
+ ''' + OSystem = "" + OSystem = args.OSystem + save_path = args.output if path.endswith('.tar'): - cmd = "tar -xf {} --transform='s/.*\///' -C {} {}".format( - path, save_path, filename) + if OSystem in ["Linux", "Windows"]: + cmd = "tar -xf {} --transform='s/.*\///' -C {} {}".format( + path, save_path, filename) + elif OSystem == "Darwin": + cmd = "gtar -xf {} --transform='s/.*\///' -C {} {}".format( + path, save_path, filename) + else: + print >> sys.stderr, "Unsupported OSystem, trying Tar anyway, OS:", OSystem + cmd = "tar -xf {} --transform='s/.*\///' -C {} {}".format( + path, save_path, filename) else: cmd = "cp {} {}".format(filename, os.path.join( save_path, filename.split('/')[-1])) From 2d5fe840870c44ce8637fac41129db4aebf53afd Mon Sep 17 00:00:00 2001 From: James Ferguson Date: Mon, 12 Nov 2018 17:48:13 +1100 Subject: [PATCH 05/11] Fixed error with first argument rename --- batch_tater.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batch_tater.py b/batch_tater.py index 48237f6..2399631 100644 --- a/batch_tater.py +++ b/batch_tater.py @@ -86,4 +86,4 @@ else: print >> sys.stderr, "PATH not found! check index nooblet" - print >> sys.stderr, "inputs:", index, tar_list, tar_name + print >> sys.stderr, "inputs:", master, tar_list, tar_name From 2bc96b19420b958ece51dfb86f52d953d3fd59fc Mon Sep 17 00:00:00 2001 From: James Ferguson Date: Mon, 12 Nov 2018 18:27:50 +1100 Subject: [PATCH 06/11] fixed problem with using a path in tar_list --- batch_tater.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/batch_tater.py b/batch_tater.py index 2399631..f365b33 100644 --- a/batch_tater.py +++ b/batch_tater.py @@ -63,7 +63,7 @@ # this will probs need to be changed based on naming convention # I think i was a little tired when I wrote this -# tar_name = '.'.join(tar_list.split('/')[-1].split('.')[:3]) +tar_list = tar_list.split('/')[-1] PATH = 0 @@ -86,4 +86,4 @@ else: print >> sys.stderr, "PATH not found! 
check index nooblet" - print >> sys.stderr, "inputs:", master, tar_list, tar_name + print >> sys.stderr, "inputs:", master, tar_list, save_path From 47fd0e9f5e18375f8cb03c9af1868fddb7cc9a75 Mon Sep 17 00:00:00 2001 From: James Ferguson Date: Mon, 12 Nov 2018 18:29:49 +1100 Subject: [PATCH 07/11] fixed comparison problem --- batch_tater.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/batch_tater.py b/batch_tater.py index f365b33..0e4bad9 100644 --- a/batch_tater.py +++ b/batch_tater.py @@ -63,7 +63,7 @@ # this will probs need to be changed based on naming convention # I think i was a little tired when I wrote this -tar_list = tar_list.split('/')[-1] +list_name = tar_list.split('/')[-1] PATH = 0 @@ -72,7 +72,7 @@ for l in f: l = l.strip('\n') l = l.split('\t') - if l[0] == tar_list: + if l[0] == list_name: PATH = l[1] break From abbfc135888e7ceb26c52b3cb4394c8eae28667c Mon Sep 17 00:00:00 2001 From: James Ferguson Date: Tue, 13 Nov 2018 14:09:22 +1100 Subject: [PATCH 08/11] Make -z option put files in output dir --- fast5_fetcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fast5_fetcher.py b/fast5_fetcher.py index b655211..169425d 100644 --- a/fast5_fetcher.py +++ b/fast5_fetcher.py @@ -140,6 +140,7 @@ def main(): for i in p_dic: fname = "tater_" + i.split('/')[-1] + ".txt" m_entry = "{}\t{}".format(fname, i) + fname = args.output + "/tater_" + i.split('/')[-1] + ".txt" m.write(m_entry) m.write('\n') with open(fname, 'w') as f: From d08632ec6b38ec8b1edd53703039b5936f43e14a Mon Sep 17 00:00:00 2001 From: James Ferguson Date: Mon, 3 Dec 2018 17:08:45 +1100 Subject: [PATCH 09/11] added trimming for experiment partitioning --- README.md | 67 ++++++++++++++++++++------ fast5_fetcher.py | 120 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 168 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 1935a9e..8cab70f 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,27 @@ ## Contents + + +- [Background](#Background) +- [Requirements](#Requirements) +- [Installation](#installation) +- [Getting Started](<#Getting Started>) + - [File structures](<#File structures>) + - [1. Raw structure (not preferred)](<#1. Raw structure>) + - [2. Local basecalled structure](<#2. Local basecalled structure>) + - [3. Parallel basecalled structure](<#3. Parallel basecalled structure>) + - [Inputs](#Inputs) +- [Instructions for use](<#Instructions for use>) + - [Quick start](<#Quick start>) + - [fast5_fetcher.py](#fast5_fetcher.py) + - [Examples](#Examples) + - [batch_tater.py](#batch_tater.py) +- [Acknowledgements](#Acknowledgements) +- [Cite](#Cite) +- [License](#License) + + # Background Reducing the number of fast5 files per folder in a single experiment was a welcomed addition to MinKnow. However this also made it rather useful for manual basecalling on a cluster, using array jobs, where each folder is basecalled individually, producing its own `sequencing_summary.txt`, `reads.fastq`, and reads folder containing the newly basecalled fast5s. Taring those fast5 files up into a single file was needed to keep the sys admins at bay, complaining about our millions of individual files on their drives. This meant, whenever there was a need to use the fast5 files from an experiment, or many experiments, unpacking the fast5 files was a significant hurdle both in time and disk space. 
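The index format that `-i` expects is only implied by `get_paths()`: a line ending in `.tar` names the current tarball, and the `.fast5` lines after it are that tarball's members. Below is a hedged sketch of building such an index from scratch; the top-level directory and output name are assumptions, and the result can be gzipped to `name.index.gz` or passed uncompressed, since both are handled.

    import os
    import sys
    import tarfile

    # Write each tarball's path, then the member paths inside it --
    # the flat layout that get_paths() scans for.
    top = sys.argv[1] if len(sys.argv) > 1 else "."
    with open("name.index", "w") as idx:
        for root, dirs, files in os.walk(top):
            for fname in sorted(files):
                if fname.endswith(".tar"):
                    tar_path = os.path.join(root, fname)
                    idx.write(tar_path + "\n")
                    with tarfile.open(tar_path) as t:  # list members without extracting
                        for member in t.getnames():
                            idx.write(member + "\n")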
@@ -230,25 +251,35 @@ See examples below for use on an **HPC** using **SGE** #### Full usage - usage: fast5_fetcher.py [-h] [-q FASTQ | -p PAF | -f FLAT] [-s SEQ_SUM] - [-i INDEX] [-o OUTPUT] [-z] + usage: fast5_fetcher.py [-h] [-q FASTQ | -p PAF | -f FLAT] [--OSystem OSYSTEM] + [-s SEQ_SUM] [-i INDEX] [-o OUTPUT] [-t] + [-l TRIM_LIST] [-x PREFIX] [-z] fast_fetcher - extraction of specific nanopore fast5 files optional arguments: - -h, --help show this help message and exit - -q FASTQ, --fastq FASTQ - fastq.gz for read ids - -p PAF, --paf PAF paf alignment file for read ids - -f FLAT, --flat FLAT flat file of read ids - -s SEQ_SUM, --seq_sum SEQ_SUM - sequencing_summary.txt.gz file - -i INDEX, --index INDEX - index.gz file mapping fast5 files in tar archives - -o OUTPUT, --output OUTPUT - output directory for extracted fast5s - -z, --pppp Print out tar commands in batches for further - processing + -h, --help show this help message and exit + -q FASTQ, --fastq FASTQ + fastq.gz for read ids + -p PAF, --paf PAF paf alignment file for read ids + -f FLAT, --flat FLAT flat file of read ids + --OSystem OSYSTEM running operating system - leave default unless doing + odd stuff + -s SEQ_SUM, --seq_sum SEQ_SUM + sequencing_summary.txt.gz file + -i INDEX, --index INDEX + index.gz file mapping fast5 files in tar archives + -o OUTPUT, --output OUTPUT + output directory for extracted fast5s + -t, --trim trim files as if standalone experiment, (fq, SS) + -l TRIM_LIST, --trim_list TRIM_LIST + list of file names to trim, comma separated. fastq + only needed for -p and -f modes + -x PREFIX, --prefix PREFIX + trim file prefix, eg: barcode_01, output: + barcode_01.fastq, barcode_01_seq_sum.txt + -z, --pppp Print out tar commands in batches for further + processing ## Examples @@ -315,6 +346,12 @@ CMD="qsub -cwd -V -pe smp 1 -N F5F -S /bin/bash -t 1-12 -l mem_requested=20G,h_v echo $CMD && $CMD ``` +## Trimming fastq and sequencing_summary files + +By using the `-t, --trim` option, each barcode will also have its own sequencing_summary file for downstream analysis. This is particularly useful if each barcode is a different sample or experiment, as the output is as if it was it's own individual flowcell. + +This method can also trim fastq, and sequencing_summary files when using the **paf** or **flat** methods. By using the prefix option, you can label the output names, otherwise generic defaults will be used. 
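As a concrete sketch of the trimming flags (every file name below is hypothetical): with fastq input, `-t` plus a `-x` prefix is all that is needed, since the input fastq already holds only the reads of interest, while the paf and flat modes additionally take the fastq to trim through `-l`:

    python fast5_fetcher.py -q barcode01.fastq.gz -s sequencing_summary.txt.gz -i name.index.gz -o fast5/ -t -x barcode01

    python fast5_fetcher.py -p barcode01.paf -l barcode01.fastq.gz -s sequencing_summary.txt.gz -i name.index.gz -o fast5/ -t -x barcode01

The first form writes `barcode01_seq_sum.txt`; the second also writes `barcode01.fastq`, both trimmed to the read IDs pulled from the paf.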
+ ## batch_tater.py Potato scripting engaged diff --git a/fast5_fetcher.py b/fast5_fetcher.py index 169425d..42106de 100644 --- a/fast5_fetcher.py +++ b/fast5_fetcher.py @@ -29,6 +29,7 @@ version 1.0 - First release version 1.1 - refactor with dicswitch and batch_tater updates version 1.1.1 - Bug fix on --transform method, added OS detection + version 1.2.0 - Added file trimming to fully segment selection TODO: - Python 3 compatibility @@ -36,7 +37,7 @@ - autobuild index file - make it a sub script as well - Consider using csv.DictReader() instead of wheel building - flesh out batch_tater and give better examples and clearer how-to - - options to build new index/SS of fetched fast5s + - options to build new index of fetched fast5s ----------------------------------------------------------------------------- MIT License @@ -91,6 +92,12 @@ def main(): help="index.gz file mapping fast5 files in tar archives") parser.add_argument("-o", "--output", help="output directory for extracted fast5s") + parser.add_argument("-t", "--trim", action="store_true", + help="trim files as if standalone experiment, (fq, SS)") + parser.add_argument("-l", "--trim_list", + help="list of file names to trim, comma separated. fastq only needed for -p and -f modes") + parser.add_argument("-x", "--prefix", default="default", + help="trim file prefix, eg: barcode_01, output: barcode_01.fastq, barcode_01_seq_sum.txt") # parser.add_argument("-t", "--procs", type=int, # help="Number of CPUs to use - TODO: NOT YET IMPLEMENTED") parser.add_argument("-z", "--pppp", action="store_true", @@ -108,14 +115,53 @@ def main(): if args.pppp: print >> sys.stderr, "PPPP state! Not extracting, exporting tar commands" + trim_pass = False + if args.trim: + SS = False + FQ = False + if args.trim_list: + A = args.trim_list.split(',') + for a in A: + if "fastq" in a: + FQ = a + elif "txt" in a: + SS = a + else: + print >> sys.stderr, "Unknown trim input. detects 'fastq' or 'txt' for files. Input:", a + else: + print >> sys.stderr, "No extra files given. Compatible with -q fastq input only" + + if args.fastq: + FQ = args.fastq + if args.seq_sum: + SS = args.seq_sum + + # final check + if FQ and SS: + trim_pass = True + print >> sys.stderr, "Trim setting detected. Writing to working direcory" + else: + print >> sys.stderr, "Unable to verify both fastq and sequencing_summary files. Please check filenames and try again. Exiting..." + sys.exit() + ids = [] if args.fastq: ids = get_fq_reads(args.fastq) + if trim_pass: + trim_SS(args, ids, SS) elif args.paf: ids = get_paf_reads(args.paf) + if trim_pass: + trim_both(args, ids, FQ, SS) elif args.flat: ids = get_flat_reads(args.flat) - filenames = get_filenames(args.seq_sum, ids) + if trim_pass: + trim_both(args, ids, FQ, SS) + if not ids and trim_pass: + filenames, ids = get_filenames(args.seq_sum, ids) + trim_both(args, ids, FQ, SS) + else: + filenames, ids = get_filenames(args.seq_sum, ids) paths = get_paths(args.index, filenames) print >> sys.stderr, "extracting..." 
@@ -233,6 +279,71 @@ def get_flat_reads(filename): return read_ids +def trim_SS(args, ids, SS): + ''' + Trims the sequencing_summary.txt file to only the input IDs + ''' + if args.prefix: + pre = args.prefix + "_seq_sum.txt" + else: + pre = "trimmed_seq_sum.txt" + head = True + if SS.endswith('.gz'): + f_read = dicSwitch('gz') + else: + f_read = dicSwitch('norm') + # make this compatible with dicSwitch + with open(pre, "w") as w: + with f_read(SS, 'rb') as sz: + if SS.endswith('.gz'): + sz = io.BufferedReader(sz) + for line in sz: + if head: + w.write(line) + head = False + continue + l = line.split() + if l[1] in ids: + w.write(line) + + +def trim_both(args, ids, FQ, SS): + ''' + Trims the sequencing_summary.txt and fastq files to only the input IDs + ''' + # trim the SS + trim_SS(args, ids, SS) + if args.prefix: + pre = args.prefix + ".fastq" + else: + pre = "trimmed.fastq" + + # trim the fastq + c = 0 + P = False + if FQ.endswith('.gz'): + f_read = dicSwitch('gz') + else: + f_read = dicSwitch('norm') + with open(pre, "w") as w: + with f_read(FQ, 'rb') as fq: + if FQ.endswith('.gz'): + fq = io.BufferedReader(fq) + for line in fq: + c += 1 + if c == 1: + if line.split()[0][1:] in ids: + P = True + w.write(line) + elif P and c < 4: + w.write(line) + elif c >= 4: + if P: + w.write(line) + c = 0 + P = False + + def get_filenames(seq_sum, ids): ''' match read ids with seq_sum to pull filenames @@ -241,7 +352,7 @@ def get_filenames(seq_sum, ids): ss_only = False if not ids: ss_only = True - + ids = set() head = True files = set() if seq_sum.endswith('.gz'): @@ -259,10 +370,11 @@ def get_filenames(seq_sum, ids): line = line.split() if ss_only: files.add(line[0]) + ids.add(line[1]) else: if line[1] in ids: files.add(line[0]) - return files + return files, ids def get_paths(index_file, filenames, f5=None): From 8a2856f55946713e2bbf9a51b02db04e25709d38 Mon Sep 17 00:00:00 2001 From: James Ferguson Date: Tue, 11 Dec 2018 10:31:26 +1100 Subject: [PATCH 10/11] bug fix: cp in extract file for file structure 1 --- fast5_fetcher.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fast5_fetcher.py b/fast5_fetcher.py index 42106de..9944c76 100644 --- a/fast5_fetcher.py +++ b/fast5_fetcher.py @@ -50,7 +50,7 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - +MyParser The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. @@ -453,8 +453,7 @@ def extract_file(args, path, filename): cmd = "tar -xf {} --transform='s/.*\///' -C {} {}".format( path, save_path, filename) else: - cmd = "cp {} {}".format(filename, os.path.join( - save_path, filename.split('/')[-1])) + cmd = "cp {} {}".format(filename, save_path) subprocess.call(cmd, shell=True, executable='/bin/bash') From 7de274b9514c43f63646d09960338969a58840c3 Mon Sep 17 00:00:00 2001 From: James Ferguson Date: Wed, 13 Feb 2019 23:45:24 +1100 Subject: [PATCH 11/11] SquiggleKit move notice --- README.md | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 8cab70f..27a871c 100644 --- a/README.md +++ b/README.md @@ -6,29 +6,34 @@ **fast5_fetcher** is a tool for fetching nanopore fast5 files to save time and simplify downstream analysis. 
+ +## **fast5_fetcher is now part of SquiggleKit located [here](https://github.com/Psy-Fer/SquiggleKit)** +### Please use and cite SquiggleKit as it is the most up to date + + [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1413903.svg)](https://doi.org/10.5281/zenodo.1413903) ## Contents -- [Background](#Background) -- [Requirements](#Requirements) +- [Background](#background) +- [Requirements](#requirements) - [Installation](#installation) -- [Getting Started](<#Getting Started>) - - [File structures](<#File structures>) +- [Getting Started](<#getting started>) + - [File structures](<#file structures>) - [1. Raw structure (not preferred)](<#1. Raw structure>) - [2. Local basecalled structure](<#2. Local basecalled structure>) - [3. Parallel basecalled structure](<#3. Parallel basecalled structure>) - - [Inputs](#Inputs) + - [Inputs](#inputs) - [Instructions for use](<#Instructions for use>) - [Quick start](<#Quick start>) - [fast5_fetcher.py](#fast5_fetcher.py) - [Examples](#Examples) - [batch_tater.py](#batch_tater.py) -- [Acknowledgements](#Acknowledgements) -- [Cite](#Cite) -- [License](#License) +- [Acknowledgements](#acknowledgements) +- [Cite](#cite) +- [License](#license) # Background
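Pulling together the gtar detection from PATCH 04 and the cp fix from PATCH 10, here is a hedged sketch of the command `extract_file()` assembles for a single read. The paths are hypothetical; on macOS (Darwin), `gtar` from gnu-tar stands in for `tar`.

    import platform
    import subprocess

    path = "/data/run1/fast5/batch_0.tar"  # tarball path from the index
    filename = "fast5/a.fast5"             # member inside the tarball
    save_path = "fast5_out/"

    tar_bin = "gtar" if platform.system() == "Darwin" else "tar"
    if path.endswith(".tar"):
        # --transform blows away the leading folders on extraction
        cmd = "{} -xf {} --transform='s/.*\\///' -C {} {}".format(
            tar_bin, path, save_path, filename)
    else:
        # untarred file structure: plain copy (the PATCH 10 fix)
        cmd = "cp {} {}".format(filename, save_path)
    subprocess.call(cmd, shell=True, executable="/bin/bash")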