diff --git a/Fast5Pipeline b/Fast5Pipeline new file mode 100755 index 0000000..e7b0fb0 --- /dev/null +++ b/Fast5Pipeline @@ -0,0 +1,162 @@ +#!/usr/bin/env bash +# -*- coding: utf-8 -*- +# Created on Fri June 23 14:31:04 2023 +# @author: AlbertRockG + +# Function to display usage information +usage() { + echo "Usage: $0 -c -i -o -k -m -b " + echo "Options:" + echo " -c, --guppy-config Guppy config file" + echo " -i, --input-dir Input directory" + echo " -o, --output-dir Output directory: different from the input directory" + echo " -k, --barcode-kits Barcode kits" + echo " -m, --medaka-model Medaka model: only high accuracy models supported" + echo " -b, --batchsize GPU memory control: controls memory use (default: 100)." + exit 1 +} + +# Function to concatenate fastq.gz files in a subfolder +concatenate_fastq() { + local subfolder=$1 + + echo "Processing subfolder: $subfolder" + + # Create the "joined" folder in the parent directory + parent_dir=$(dirname "$subfolder") + joined_folder="$parent_dir/joined" + mkdir -p "$joined_folder" + + # Create output file name based on subfolder name + output_file="$joined_folder/$(basename "$subfolder").fastq.gz" + + # Concatenate fastq.gz files into the output file + gunzip -c "$subfolder"/*.fastq.gz | gzip -c > "$output_file" + + echo "Concatenated fastq.gz files into: $output_file" + echo "" +} + +# Function to run flye on a joined_fastq file +run_fly() { + local input_file=$1 + + echo "Processing file: $input_file" + + # Create the "assembled" folder in the parent directory + parent_dir="${input_file%%/*}" + assembly_folder="$parent_dir/assembled" + mkdir -p "$assembly_folder" + + # Create output file name based on subfolder name + output="$assembly_folder/$(basename "$input_file")" + + # Run flye on the input file + flye -t "$(nproc)" --out-dir "$output" --nano-hq "$input_file" + + echo "Flye analysis completed for: $input_file" + echo "" +} + +# Function to run flye on a joined_fastq file +run_medaka() { + local input_file=$1 + + echo "Processing file: $input_file" + + # Create the "polished" folder in the parent directory + main_dir="${input_file%%/*}" + assembly_dir="$(basename "$input_file")" + polished_folder="$main_dir/polished" + mkdir -p "$polished_folder" + + # Create output dir name based on the input file name + output="$polished_folder/$(basename "$input_file")" + + # Run medaka on the input file + medaka_consensus -i "$input_file" \ + -d "$main_dir"/assembled/"$assembly_dir"/assembly.fasta \ + -o "$output" -m "$medaka_model" -t "$(nproc)" -b "$batchsize" + + echo "Medaka analysis completed for: $input_file" + echo "" +} + + +# Parse command-line options +while getopts ":c:i:o:k:m:b:" opt; do + case "${opt}" in + c) + guppy_config=${OPTARG} + ;; + i) + input_dir=${OPTARG} + ;; + o) + output_dir=${OPTARG} + ;; + k) + barcode_kits=${OPTARG} + ;; + m) + medaka_model=${OPTARG} + ;; + b) + batchsize=${OPTARG} + ;; + *) + usage + ;; + esac +done + +# Check if all required options are provided +if [[ -z $guppy_config || -z $input_dir || -z $output_dir || -z $barcode_kits || -z $medaka_model ]]; then + echo "Error: Missing required options" + usage +fi + +# Step 1: Run guppy_basecaller +guppy_basecaller --disable_pings -x auto \ + -c "$guppy_config" \ + -i "$input_dir" --recursive \ + -s "$output_dir/basecalled" + +# Step 2: Run guppy_barcoder +guppy_barcoder -t "$(nproc)" --disable_pings -x auto \ + --barcode_kits "$barcode_kits" --enable_trim_barcodes \ + -i "$output_dir/basecalled" --recursive \ + -s "$output_dir/trimmed" --compress_fastq + +# Step 3: Join fastq.gz files + +# Loop through subfolders in the input folder +for subfolder in "$output_dir"/trimmed/*; do + if [[ -d $subfolder ]]; then + concatenate_fastq "$subfolder" + fi +done + +# Step 4: Run flye +for input_file in "$output_dir"/joined/*.fastq.gz; do + if [[ -f $input_file ]]; then + run_flye "$input_file" + fi +done + +# Step 6: Run medaka_consensus +conda_env=medaka + +if conda info --envs | grep -wq "$conda_env"; then + # Activate conda environment + conda activate medaka + + for input_file in "$output_dir"/joined/*.fastq.gz; do + if [[ -f $input_file ]]; then + run_medaka "$input_file" + fi + done + + # Deactivate conda environment + conda deactivate +fi