Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lyveset fastq file parsing bugfix and other improvements #156

Merged
merged 6 commits into from
Aug 30, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 59 additions & 19 deletions tasks/phylogenetic_inference/task_lyveset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ task lyveset {
String docker_image = "us-docker.pkg.dev/general-theiagen/staphb/lyveset:1.1.4f"
Int memory = 64
Int cpu = 16
Int disk_size = 100
Int disk_size = 250
# Lyve-SET Parameters
##COMMON OPTIONS
##--allowedFlanking 0 allowed flanking distance in bp.
Expand Down Expand Up @@ -60,7 +60,7 @@ task lyveset {
Boolean fast = false
Boolean downsample = false
Boolean sample_sites = false
String? read_cleaner = 'CGP'
String read_cleaner = "CGP"
String? mapper
String? snpcaller
}
Expand All @@ -73,41 +73,81 @@ task lyveset {
read2_array=(~{sep=' ' read2})
read2_array_len=$(echo "${#read2[@]}")

if [ "$read1_array_len" -ne "$read2_index_array_len" ]; then
if [ "$read1_array_len" -ne "$read2_array_len" ]; then
echo "read1 array (length: $read1_array_len) and read2 index array (length: $read2_array_len) are of unequal length." >&2
exit 1
fi

# create lyvset project
set_manage.pl --create ~{dataset_name}

#shuffle paired end reads
for index in ${!read1_array[@]}; do
cp ${read1_array[$index]} . && cp ${read2_array[$index]} . # move reads to cwd
### PREFACE ###
# This FASTQ file re-naming strategy is necessary due to filename parsing in shuffleSplitReads.pl here: https://github.com/lskatz/lyve-SET/blob/v1.1.4f/scripts/shuffleSplitReads.pl#L34
# Curtis' interpretation of perl code:
# It first checks for a pattern like '_R1_' or '_R2_', and if found, sets the $readNumber variable to 1 or 2 respectively.
# If that pattern is not found, it checks for a pattern like '_1.f' or '_2.f', again setting the $readNumber accordingly.
# If neither pattern is matched, it raises an error indicating that the read number could not be parsed from the filename.
### END PREFACE ###

mkdir -v input-fastqs

# Firstly, rename read1 and read2 so that underscores are replaced with dashes except any underscores surrounding R1 or R2
# Also, place files within input-fastqs/ directory
echo "DEBUG: FASTQ file renaming. Replacing underscores with dashes, except underscores surrounding R1 or R2"
for FASTQ in "${!read1_array[@]}"; do
FASTQ_BASENAME=$(basename "${read1_array[$FASTQ]}")
# sed line replaces underscores with dashes, except surrounding R1 or R2
mv -v ${read1_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
done
# do the same for read2
for FASTQ in "${!read2_array[@]}"; do
FASTQ_BASENAME=$(basename "${read2_array[$FASTQ]}")
# sed line replaces underscores with dashes, except surrounding R1 or R2
mv -v ${read2_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
done

shuffleSplitReads.pl --numcpus ~{cpu} -o ./interleaved *.fastq.gz
### renaming FASTQs ending with _R1.fastq.gz or _R2.fastq.gz (i.e. those downloaded w/ SRA_Fetch or Basespace_Fetch wfs) ###
# read1
for FASTQ in input-fastqs/*; do
FASTQ_BASENAME=$(basename ${FASTQ})
# if the R1 FASTQ filenames end in "_R1.fastq.gz" rename the files to match lyveset naming convention
if [[ ${FASTQ} =~ _R1.fastq.gz$ ]]; then
echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
# if the R2 FASTQ filenames end in "_R2.fastq.gz" rename the files to match lyveset naming convention
elif [[ ${FASTQ} =~ _R2.fastq.gz$ ]]; then
echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
else
echo "DEBUG: did not detect any FASTQ files ending in _R1.fastq.gz or _R2.fastq.gz"
fi
done

echo "DEBUG: here's the final FASTQ filenames, prior to shuffling:"
ls -lh input-fastqs/
echo

# then moved into your project dir
mv ./interleaved/*.fastq.gz ~{dataset_name}/reads/
echo "DEBUG: merging R1 and R2 FASTQ files into interleaved FASTQ files with shuffleSplitReads.pl now..."
shuffleSplitReads.pl --numcpus ~{cpu} -o "./~{dataset_name}/reads" input-fastqs/*.fastq.gz

# cleanup
rmdir interleaved
mkdir ~{dataset_name}/ref/
cp ~{reference_genome} ~{dataset_name}/ref/reference.fasta
# make directory for reference genome and copy reference genome into it. Also rename to reference.fasta
mkdir -v ~{dataset_name}/ref/
cp -v ~{reference_genome} ~{dataset_name}/ref/reference.fasta

# launch lyveSET workflow now that everything is set up
launch_set.pl --numcpus ~{cpu} \
--allowedFlanking ~{allowedFlanking} \
--min_alt_frac ~{min_alt_frac} \
--min_coverage ~{min_coverage} \
~{'--presets ' + presets} \
~{true='--mask_phages' false='' mask_phages} \
~{true='--mask_cliffs' false='' mask_cliffs} \
~{true='--mask-phages' false='' mask_phages} \
~{true='--mask-cliffs' false='' mask_cliffs} \
~{true='--nomatrix' false='' nomatrix} \
~{true='--nomsa' false='' nomsa} \
~{true='--notrees' false='' notrees} \
~{true='--fast' false='' fast} \
~{true='--downsample' false='' downsample} \
~{true='--sample_sites' false='' sample_sites} \
~{true='--sample-sites' false='' sample_sites} \
~{'--read_cleaner ' + read_cleaner} \
~{'--mapper ' + mapper} \
~{'--snpcaller ' + snpcaller} \
Expand All @@ -126,11 +166,11 @@ task lyveset {
File? lyveset_pooled_snps_vcf = "~{dataset_name}/msa/out.pooled.snps.vcf.gz"
File? lyveset_filtered_matrix = "~{dataset_name}/msa/out.filteredMatrix.tsv"
File? lyveset_alignment_fasta = "~{dataset_name}/msa/out.aln.fas"
File? lyveset_reference_fasta = "~{dataset_name}/reference/reference.fasta"
File? lyveset_masked_regions = "~{dataset_name}/reference/maskedRegions.bed"
File? lyveset_reference_fasta = "~{dataset_name}/ref/reference.fasta"
File? lyveset_masked_regions = "~{dataset_name}/ref/maskedRegions.bed"
Array[File]? lyveset_msa_outputs = glob("~{dataset_name}/msa/out*")
Array[File]? lyveset_log_outputs = glob("~{dataset_name}/log/*")
Array[File]? lyveset_reference_outputs = glob("~{dataset_name}/reference/*")
Array[File]? lyveset_reference_outputs = glob("~{dataset_name}/ref/*")
Array[File]? lyveset_bam_outputs = glob("~{dataset_name}/bam/*.bam*")
Array[File]? lyveset_vcf_outputs = glob("~{dataset_name}/vcf/*.vcf*")
File lyveset_log = stdout()
Expand Down