Skip to content

Commit

Permalink
refactored and added reference_type detection
Browse files Browse the repository at this point in the history
  • Loading branch information
nukappa committed Dec 19, 2024
1 parent e2eb4a2 commit fddf04f
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 21 deletions.
10 changes: 2 additions & 8 deletions spacemake/cmdline.py
Original file line number Diff line number Diff line change
Expand Up @@ -1205,13 +1205,7 @@ def spacemake_migrate(args):
# Make sure that the project-id and sample-id combination provided exists
pdf.assert_sample(project_id, sample_id)
project_folder = os.path.join('projects', project_id, 'processed_data', sample_id, 'illumina', 'complete_data')

# Extract vars from the config.yaml for later use
with open("config.yaml") as yamlfile:
cf = yaml.safe_load(yamlfile.read())
sample_species = pdf.get_sample_info(project_id, sample_id)['species']
genome_sequence = cf['species'][sample_species]['genome']['sequence']


# Begin migration
print('Beginning migration ...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

Expand All @@ -1227,7 +1221,7 @@ def spacemake_migrate(args):
print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
"not found on disk. Will generate them now.")
# Execute code to convert to CRAM
convert_bam_to_cram(genome_sequence, project_folder, threads)
convert_bam_to_cram(project_id, sample_id, threads)
else:
print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
"already on disk.")
Expand Down
59 changes: 46 additions & 13 deletions spacemake/migrate.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import os
import subprocess
import time
import yaml

from spacemake.project_df import get_global_ProjectDF
from spacemake.util import sync_timestamps


Expand All @@ -19,47 +21,78 @@ def find_bam_files(folder):
bam_file_paths = [os.path.join(folder, f) for f in bam_files]
are_symlinks = [os.path.islink(bam_file_path) for bam_file_path in bam_file_paths]

return list(zip(bam_files, are_symlinks))
return list(zip(bam_file_paths, are_symlinks))


def convert_bam_to_cram(ref_sequence, project_folder, threads=4):
bam_files = find_bam_files(project_folder)
def get_map_strategy_sequences(project_id, sample_id):
"""
Returns a dictionary of reference_types and their location, e.g. {rRNA : /path/to/disk/sequence.fa}
"""
pdf = get_global_ProjectDF()

map_strategy = pdf.get_sample_info(project_id, sample_id)['map_strategy']
sequence_type = [mapping.split(':')[1] for mapping in map_strategy.split('->')]

with open("config.yaml") as yamlfile:
cf = yaml.safe_load(yamlfile.read())
sample_species = pdf.get_sample_info(project_id, sample_id)['species']

reference_type = {st : cf['species'][sample_species][st]['sequence'] for st in sequence_type}

return reference_type


def convert_bam_to_cram(project_id, sample_id, threads=4):
"""
Converts all BAM files to CRAM and updates the timestamps to those of the
original files. Symbolic links are treated as such.
"""
species_sequences = get_map_strategy_sequences(project_id, sample_id)

project_folder = os.path.join('projects', project_id, 'processed_data',
sample_id, 'illumina', 'complete_data')
bam_files = find_bam_files(project_folder)

for idx in range(len(bam_files)):
bam_filename, bam_file_is_symlink = bam_files[idx]
bam_filename_prefix = bam_filename.rsplit('.', 1)[0]
cram_filename = bam_filename_prefix + '.cram'

if os.path.exists(os.path.join(project_folder, cram_filename)):
if os.path.exists(cram_filename):
print('CRAM file', cram_filename, 'already exists. Skipping conversion.')
continue

# TODO: change ref sequence for genome, rRNA, phiX, custom? Get it from map_strategy
if 'unaligned' in bam_filename:
continue

if bam_file_is_symlink:
true_bam_filename = os.readlink(os.path.join(project_folder, bam_filename))
true_bam_filename = os.readlink(bam_filename)
true_bam_filename_prefix = true_bam_filename.rsplit('.', 1)[0]
os.symlink(true_bam_filename_prefix + '.cram',
os.path.join(project_folder, cram_filename))
os.symlink(true_bam_filename_prefix + '.cram', cram_filename)
else:
print('Converting', bam_filename, 'to', cram_filename,
'...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

for ref_type in species_sequences:
if ref_type in bam_filename:
ref_sequence = species_sequences[ref_type]
break

subprocess.run(
[
"samtools", "view",
"-T", ref_sequence,
"-C",
"--threads", str(threads),
"-o", os.path.join(project_folder, cram_filename),
os.path.join(project_folder, bam_filename)
"-o", cram_filename,
bam_filename
]
)

sync_timestamps(os.path.join(project_folder, bam_filename),
os.path.join(project_folder, cram_filename))
sync_timestamps(bam_filename, cram_filename)


def remove_files():
def remove_files(project_folder):
# - BAM files (only if CRAMs are present)
bam_files = find_bam_files(project_folder)

Expand Down

0 comments on commit fddf04f

Please sign in to comment.