Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tool for migrating between versions #129

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 94 additions & 2 deletions spacemake/cmdline.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,45 @@ def setup_run_parser(parent_parser_subparsers):
return parser_run


def setup_migrate_parser(parent_parser_subparsers):
"""setup_migrate_parser

:param parent_parser_subparsers
"""
parser_migrate = parent_parser_subparsers.add_parser(
"migrate",
help="migrate spacemake"
)

parser_migrate.add_argument(
"--project-id",
default="",
help="The project-id of the sample to perform the migration",
type=str,
required=True,
dest="project_id",
)
parser_migrate.add_argument(
"--sample-id",
default="",
help="The sample-id of the sample to perform the migration",
type=str,
required=True,
dest="sample_id",
)
parser_migrate.add_argument(
"--threads",
default="1",
help="Number of threads to use",
type=str,
required=False,
dest="threads",
)

parser_migrate.set_defaults(func=spacemake_migrate)

return parser_migrate

#####################################################
# actual command-line functions, used as call-backs #
#####################################################
Expand Down Expand Up @@ -1146,6 +1185,52 @@ def list_projects_cmdline(args):
logger.info(df.loc[:, variables].__str__())


@message_aggregation(logger_name)
def spacemake_migrate(args):
"""spacemake_migrate.

:param args:
"""
from spacemake.migrate import convert_bam_to_cram
from spacemake.project_df import get_global_ProjectDF
import time
import yaml

project_id = args['project_id']
sample_id = args['sample_id']
threads = args['threads']

pdf = get_global_ProjectDF()

# Make sure that the project-id and sample-id combination provided exists
pdf.assert_sample(project_id, sample_id)
project_folder = os.path.join('projects', project_id, 'processed_data', sample_id, 'illumina', 'complete_data')

# Begin migration
print('Beginning migration ...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

if not os.path.exists(os.path.join(project_id, 'stats.csv')):
print(f"Stats file for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
"not found on disk. Will generate it now.")
# Execute code written elsewhere to generate the file
else:
print("Stats file found on disk")

if not os.path.exists(os.path.join(project_folder, 'final.cram')):
# TODO: fix this with a proper check.
print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
"not found on disk. Will generate them now.")
# Execute code to convert to CRAM
convert_bam_to_cram(project_id, sample_id, threads)
else:
print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
"already on disk.")

print("Removing unnecessary files ...", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

print("Migration complete ...", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))


def make_main_parser():
#################
# DEFINE PARSER #
Expand All @@ -1159,7 +1244,7 @@ def make_main_parser():

parser_main = argparse.ArgumentParser(
allow_abbrev=False,
description="spacemake: bioinformatic pipeline for processing and analysis of spatial-transcriptomics data",
description="Spacemake: processing and analysis of large-scale spatial transcriptomics data",
)

parser_main.add_argument("--version", action="store_true")
Expand All @@ -1172,6 +1257,7 @@ def make_main_parser():
parser_projects = None
parser_config = None
parser_init = None
parser_migrate = None
parser_spatial = None

##################
Expand All @@ -1194,14 +1280,19 @@ def make_main_parser():
# SPACEMAKE PROJECT/SAMPLE #
############################
from spacemake.cmdline import setup_project_parser

parser_projects = setup_project_parser(parser_main_subparsers)

#################
# SPACEMAKE RUN #
#################
parser_run = setup_run_parser(parser_main_subparsers)

#####################
# SPACEMAKE MIGRATE #
#####################
parser_migrate = setup_migrate_parser(parser_main_subparsers)

#####################
# SPACEMAKE SPATIAL #
#####################
Expand All @@ -1214,6 +1305,7 @@ def make_main_parser():
"config": parser_config,
"projects": parser_projects,
"run": parser_run,
"migrate": parser_migrate,
"main": parser_main,
"spatial": parser_spatial,
}
Expand Down
102 changes: 102 additions & 0 deletions spacemake/migrate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import os
import subprocess
import time
import yaml

from spacemake.project_df import get_global_ProjectDF
from spacemake.util import sync_timestamps


def find_bam_files(folder):
"""
Finds all .bam files in the given folder and checks if any of them is a symlink.

Returns a list of tuples of type (str, bool), e.g. ('bam_file', False)
"""
if not os.path.isdir(folder):
raise ValueError(f"The provided path {folder} is not a valid directory.")

# Find files and check for symlinks
bam_files = [f for f in os.listdir(folder) if f.endswith('.bam')]
bam_file_paths = [os.path.join(folder, f) for f in bam_files]
are_symlinks = [os.path.islink(bam_file_path) for bam_file_path in bam_file_paths]

return list(zip(bam_file_paths, are_symlinks))


def get_map_strategy_sequences(project_id, sample_id):
"""
Returns a dictionary of reference_types and their location, e.g. {rRNA : /path/to/disk/sequence.fa}
"""
pdf = get_global_ProjectDF()

map_strategy = pdf.get_sample_info(project_id, sample_id)['map_strategy']
sequence_type = [mapping.split(':')[1] for mapping in map_strategy.split('->')]

with open("config.yaml") as yamlfile:
cf = yaml.safe_load(yamlfile.read())
sample_species = pdf.get_sample_info(project_id, sample_id)['species']

reference_type = {st : cf['species'][sample_species][st]['sequence'] for st in sequence_type}

return reference_type


def convert_bam_to_cram(project_id, sample_id, threads=4):
"""
Converts all BAM files to CRAM and updates the timestamps to those of the
original files. Symbolic links are treated as such.
"""
species_sequences = get_map_strategy_sequences(project_id, sample_id)

project_folder = os.path.join('projects', project_id, 'processed_data',
sample_id, 'illumina', 'complete_data')
bam_files = find_bam_files(project_folder)

for idx in range(len(bam_files)):
bam_filename, bam_file_is_symlink = bam_files[idx]
bam_filename_prefix = bam_filename.rsplit('.', 1)[0]
cram_filename = bam_filename_prefix + '.cram'

if os.path.exists(cram_filename):
print('CRAM file', cram_filename, 'already exists. Skipping conversion.')
continue

if 'unaligned' in bam_filename:
continue

if bam_file_is_symlink:
true_bam_filename = os.readlink(bam_filename)
true_bam_filename_prefix = true_bam_filename.rsplit('.', 1)[0]
os.symlink(true_bam_filename_prefix + '.cram', cram_filename)
else:
print('Converting', bam_filename, 'to', cram_filename,
'...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

for ref_type in species_sequences:
if ref_type in bam_filename:
ref_sequence = species_sequences[ref_type]
break

subprocess.run(
[
"samtools", "view",
"-T", ref_sequence,
"-C",
"--threads", str(threads),
"-o", cram_filename,
bam_filename
]
)

sync_timestamps(bam_filename, cram_filename)


def remove_files(project_folder):
# - BAM files (only if CRAMs are present)
bam_files = find_bam_files(project_folder)

# - unaligned.bam

# remove tiles

26 changes: 26 additions & 0 deletions spacemake/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,3 +578,29 @@ def load_config_with_fallbacks(args, try_yaml="config.yaml"):
import argparse

return argparse.Namespace(**args_kw)


def sync_timestamps(original_file, new_file):
"""
Sync the timestamps (access and modification time) of new_file with those of original_file.

Args:
original_file (str): Path to the file whose timestamps will be copied.
new_file (str): Path to the file that will have its timestamps updated.
"""
try:
# Get the access time and modification time from original_file
if os.path.islink(original_file):
source_times = os.lstat(original_file)
else:
source_times = os.stat(original_file)

# Set the same access and modification time for new_file
os.utime(new_file, (source_times.st_atime, source_times.st_mtime),
follow_symlinks=not os.path.islink(original_file))

print(f"File timestamps of {new_file} set to match {original_file}.")
except FileNotFoundError:
print(f"Error: One or both of the files '{original_file}' or '{new_file}' do not exist.")
except Exception as e:
print(f"An error occurred: {e}")