Skip to content

Commit

Permalink
improved fastq_umi_merge.py
Browse files Browse the repository at this point in the history
  • Loading branch information
ppericard committed Mar 31, 2024
1 parent 0641be8 commit d267513
Showing 1 changed file with 88 additions and 57 deletions.
145 changes: 88 additions & 57 deletions bin/fastq_umi_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,55 +3,90 @@

import argparse
import gzip
import time


def open_fastq(fastq_filepath, read_write):
def open_fastq(fastq_filepath, mode):
"""
Opens a FASTQ file for reading or writing. Determines if the file should be
treated as gzipped based on its extension. If opening for writing and the file
does not exist, it will be created.
Opens a FASTQ file, which can be optionally compressed with gzip.
Parameters:
- fastq_filepath: Path to the FASTQ file.
- mode: Mode to open the file in ('r' for read, 'w' for write, etc.).
Returns:
- File handle to the opened file.
Raises:
- FileNotFoundError if the file does not exist.
- PermissionError if there is no permission to access the file.
- IOError for other IO related errors.
"""
is_gzip = fastq_filepath.endswith(".gz")
mode = f"{read_write}t"

try:
if is_gzip:
file_handle = gzip.open(fastq_filepath, mode)
else:
file_handle = open(fastq_filepath, mode)
except IOError as e:
if fastq_filepath.endswith(".gz"):
# Handle gzip compressed files
return gzip.open(fastq_filepath, mode + "t")
# Handle uncompressed files
return open(fastq_filepath, mode)
except FileNotFoundError:
raise FileNotFoundError(f"File '{fastq_filepath}' not found.")
except PermissionError:
raise PermissionError(f"Permission denied when accessing '{fastq_filepath}'.")
except Exception as e:
raise IOError(f"Failed to open '{fastq_filepath}' in mode '{mode}': {e}")

return file_handle


def parse_fastq(fastq_filehandle):
"""
Generator function to parse fastq files. Yields each read as a tuple
Generator function to parse fastq files, yielding each read as a tuple
containing the header, sequence, and quality string.
Parameters:
- fastq_filehandle: An open file handle to a FASTQ file.
Yields:
- Tuple of (header, sequence, quality) for each read in the FASTQ file.
"""
line_number = 0
for line in fastq_filehandle:
stripped_line = line.strip()
if isinstance(line, bytes): # Handle binary mode for gzip
stripped_line = stripped_line.decode("utf-8")
if stripped_line:
line_number += 1
if line_number % 4 == 1:
read_header = stripped_line[1:]
elif line_number % 4 == 2:
read_seq = stripped_line
elif line_number % 4 == 3:
continue # Skip the '+' line
elif line_number % 4 == 0:
read_qual = stripped_line
yield (read_header, read_seq, read_qual)
while True:
try:
# Attempt to read 4 lines at a time (FASTQ format standard)
lines = [next(fastq_filehandle).strip() for _ in range(4)]
except StopIteration:
break # End of file reached
# Yield the parsed components of each read
yield lines[0][1:], lines[1], lines[3]

def merge_sequences(args, input_fastq, umi_fastq, output_fastq):
"""
Merge sequences or headers from input and UMI FASTQ files based on user arguments.
Parameters:
- args: Command line arguments specifying user preferences.
- input_fastq: File handle to the input FASTQ file.
- umi_fastq: File handle to the UMI FASTQ file.
- output_fastq: File handle for the output FASTQ file where the results are written.
"""
read_count = 0
start_time = time.time()
for (input_header, input_seq, input_qual), (_, umi_seq, _) in zip(parse_fastq(input_fastq), parse_fastq(umi_fastq)):
if args.header:
# Append the UMI to the read header
output_header = f"{input_header.split()[0]}{args.delimiter}{umi_seq} {' '.join(input_header.split()[1:])}"
output_fastq.write(f"@{output_header}\n{input_seq}\n+\n{input_qual}\n")
else:
# Merge the UMI sequence with the input sequence
merged_seq = umi_seq + input_seq
output_fastq.write(f"@{input_header}\n{merged_seq}\n+\n{input_qual}\n")

read_count += 1
if read_count % 10000 == 0: # Update for every 10000 reads processed
elapsed_time = time.time() - start_time
# Work in kilo-reads for display
kreads_per_minute = (read_count / elapsed_time) * 60 / 1000
kreads_processed = read_count / 1000
print(f"Processed {kreads_processed:.2f} kilo-reads at {kreads_per_minute:.2f} kilo-reads/minute", end='\n')

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Merge a reads fastq file with the corresponding UMI fastq file. The UMI will be placed in 5' of the read sequence."
)
parser = argparse.ArgumentParser(description="Merge FASTQ sequences with UMIs.")
# Definitions for command-line arguments with descriptions
parser.add_argument(
"-i",
"--input",
Expand Down Expand Up @@ -81,26 +116,22 @@ def parse_fastq(fastq_filehandle):
action="store_true",
help="append the UMI to the read header instead of merging it with the sequence",
)

parser.add_argument(
"-d",
"--delimiter",
type=str,
default='_',
help="delimiter between the read name and the UMI",
)
args = parser.parse_args()

with open_fastq(args.input, "r") as input_fastq_filehandle, \
open_fastq(args.umi, "r") as umi_fastq_filehandle, \
open_fastq(args.output, "w") as output_fastq_filehandle:

if args.header:
for (input_header, input_seq, input_qual), (umi_header, umi_seq, umi_qual) \
in zip(parse_fastq(input_fastq_filehandle), parse_fastq(umi_fastq_filehandle)):
split_input_header = input_header.split()
output_header = f"{split_input_header[0]}_{umi_seq} {split_input_header[1:].join(' ')}"
output_fastq_filehandle.write(
f"@{output_header}\n{input_seq}\n+\n{input_qual}\n"
)
else:
for (input_header, input_seq, input_qual), (umi_header, umi_seq, umi_qual) \
in zip(parse_fastq(input_fastq_filehandle), parse_fastq(umi_fastq_filehandle)):
merged_seq = umi_seq + input_seq
merged_qual = umi_qual + input_qual
output_fastq_filehandle.write(
f"@{input_header}\n{merged_seq}\n+\n{merged_qual}\n"
)
try:
# Open the input, UMI, and output files, handling them within a context manager
with open_fastq(args.input, "r") as input_fastq, \
open_fastq(args.umi, "r") as umi_fastq, \
open_fastq(args.output, "w") as output_fastq:

# Merge sequences or headers based on user arguments
merge_sequences(args, input_fastq, umi_fastq, output_fastq)
except IOError as e:
print(f"Error: {e}")

0 comments on commit d267513

Please sign in to comment.