From d267513e8395a7320748a5b75983f0934f573ebc Mon Sep 17 00:00:00 2001 From: Pierre Pericard Date: Sun, 31 Mar 2024 09:51:52 +0200 Subject: [PATCH] improved fastq_umi_merge.py --- bin/fastq_umi_merge.py | 145 +++++++++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 57 deletions(-) diff --git a/bin/fastq_umi_merge.py b/bin/fastq_umi_merge.py index 1668967..bbab4e8 100755 --- a/bin/fastq_umi_merge.py +++ b/bin/fastq_umi_merge.py @@ -3,55 +3,90 @@ import argparse import gzip +import time - -def open_fastq(fastq_filepath, read_write): +def open_fastq(fastq_filepath, mode): """ - Opens a FASTQ file for reading or writing. Determines if the file should be - treated as gzipped based on its extension. If opening for writing and the file - does not exist, it will be created. + Opens a FASTQ file, which can be optionally compressed with gzip. + + Parameters: + - fastq_filepath: Path to the FASTQ file. + - mode: Mode to open the file in ('r' for read, 'w' for write, etc.). + + Returns: + - File handle to the opened file. + + Raises: + - FileNotFoundError if the file does not exist. + - PermissionError if there is no permission to access the file. + - IOError for other IO related errors. """ - is_gzip = fastq_filepath.endswith(".gz") - mode = f"{read_write}t" - try: - if is_gzip: - file_handle = gzip.open(fastq_filepath, mode) - else: - file_handle = open(fastq_filepath, mode) - except IOError as e: + if fastq_filepath.endswith(".gz"): + # Handle gzip compressed files + return gzip.open(fastq_filepath, mode + "t") + # Handle uncompressed files + return open(fastq_filepath, mode) + except FileNotFoundError: + raise FileNotFoundError(f"File '{fastq_filepath}' not found.") + except PermissionError: + raise PermissionError(f"Permission denied when accessing '{fastq_filepath}'.") + except Exception as e: raise IOError(f"Failed to open '{fastq_filepath}' in mode '{mode}': {e}") - return file_handle - - def parse_fastq(fastq_filehandle): """ - Generator function to parse fastq files. Yields each read as a tuple + Generator function to parse fastq files, yielding each read as a tuple containing the header, sequence, and quality string. + + Parameters: + - fastq_filehandle: An open file handle to a FASTQ file. + + Yields: + - Tuple of (header, sequence, quality) for each read in the FASTQ file. """ - line_number = 0 - for line in fastq_filehandle: - stripped_line = line.strip() - if isinstance(line, bytes): # Handle binary mode for gzip - stripped_line = stripped_line.decode("utf-8") - if stripped_line: - line_number += 1 - if line_number % 4 == 1: - read_header = stripped_line[1:] - elif line_number % 4 == 2: - read_seq = stripped_line - elif line_number % 4 == 3: - continue # Skip the '+' line - elif line_number % 4 == 0: - read_qual = stripped_line - yield (read_header, read_seq, read_qual) + while True: + try: + # Attempt to read 4 lines at a time (FASTQ format standard) + lines = [next(fastq_filehandle).strip() for _ in range(4)] + except StopIteration: + break # End of file reached + # Yield the parsed components of each read + yield lines[0][1:], lines[1], lines[3] +def merge_sequences(args, input_fastq, umi_fastq, output_fastq): + """ + Merge sequences or headers from input and UMI FASTQ files based on user arguments. + + Parameters: + - args: Command line arguments specifying user preferences. + - input_fastq: File handle to the input FASTQ file. + - umi_fastq: File handle to the UMI FASTQ file. + - output_fastq: File handle for the output FASTQ file where the results are written. + """ + read_count = 0 + start_time = time.time() + for (input_header, input_seq, input_qual), (_, umi_seq, _) in zip(parse_fastq(input_fastq), parse_fastq(umi_fastq)): + if args.header: + # Append the UMI to the read header + output_header = f"{input_header.split()[0]}{args.delimiter}{umi_seq} {' '.join(input_header.split()[1:])}" + output_fastq.write(f"@{output_header}\n{input_seq}\n+\n{input_qual}\n") + else: + # Merge the UMI sequence with the input sequence + merged_seq = umi_seq + input_seq + output_fastq.write(f"@{input_header}\n{merged_seq}\n+\n{input_qual}\n") + + read_count += 1 + if read_count % 10000 == 0: # Update for every 10000 reads processed + elapsed_time = time.time() - start_time + # Work in kilo-reads for display + kreads_per_minute = (read_count / elapsed_time) * 60 / 1000 + kreads_processed = read_count / 1000 + print(f"Processed {kreads_processed:.2f} kilo-reads at {kreads_per_minute:.2f} kilo-reads/minute", end='\n') if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Merge a reads fastq file with the corresponding UMI fastq file. The UMI will be placed in 5' of the read sequence." - ) + parser = argparse.ArgumentParser(description="Merge FASTQ sequences with UMIs.") + # Definitions for command-line arguments with descriptions parser.add_argument( "-i", "--input", @@ -81,26 +116,22 @@ def parse_fastq(fastq_filehandle): action="store_true", help="append the UMI to the read header instead of merging it with the sequence", ) - + parser.add_argument( + "-d", + "--delimiter", + type=str, + default='_', + help="delimiter between the read name and the UMI", + ) args = parser.parse_args() - with open_fastq(args.input, "r") as input_fastq_filehandle, \ - open_fastq(args.umi, "r") as umi_fastq_filehandle, \ - open_fastq(args.output, "w") as output_fastq_filehandle: - - if args.header: - for (input_header, input_seq, input_qual), (umi_header, umi_seq, umi_qual) \ - in zip(parse_fastq(input_fastq_filehandle), parse_fastq(umi_fastq_filehandle)): - split_input_header = input_header.split() - output_header = f"{split_input_header[0]}_{umi_seq} {split_input_header[1:].join(' ')}" - output_fastq_filehandle.write( - f"@{output_header}\n{input_seq}\n+\n{input_qual}\n" - ) - else: - for (input_header, input_seq, input_qual), (umi_header, umi_seq, umi_qual) \ - in zip(parse_fastq(input_fastq_filehandle), parse_fastq(umi_fastq_filehandle)): - merged_seq = umi_seq + input_seq - merged_qual = umi_qual + input_qual - output_fastq_filehandle.write( - f"@{input_header}\n{merged_seq}\n+\n{merged_qual}\n" - ) + try: + # Open the input, UMI, and output files, handling them within a context manager + with open_fastq(args.input, "r") as input_fastq, \ + open_fastq(args.umi, "r") as umi_fastq, \ + open_fastq(args.output, "w") as output_fastq: + + # Merge sequences or headers based on user arguments + merge_sequences(args, input_fastq, umi_fastq, output_fastq) + except IOError as e: + print(f"Error: {e}")