rajewsky-lab · nukappa · Dec 17, 2024 · Dec 18, 2024 · Dec 19, 2024 · Dec 19, 2024
diff --git a/spacemake/cmdline.py b/spacemake/cmdline.py
@@ -774,6 +774,45 @@ def setup_run_parser(parent_parser_subparsers):
     return parser_run
 
 
+def setup_migrate_parser(parent_parser_subparsers):
+    """setup_migrate_parser
+
+    :param parent_parser_subparsers
+    """
+    parser_migrate = parent_parser_subparsers.add_parser(
+        "migrate",
+        help="migrate spacemake"
+    )
+
+    parser_migrate.add_argument(
+        "--project-id",
+        default="",
+        help="The project-id of the sample to perform the migration",
+        type=str,
+        required=True,
+        dest="project_id",
+    )
+    parser_migrate.add_argument(
+        "--sample-id",
+        default="",
+        help="The sample-id of the sample to perform the migration",
+        type=str,
+        required=True,
+        dest="sample_id",
+    )
+    parser_migrate.add_argument(
+        "--threads",
+        default="1",
+        help="Number of threads to use",
+        type=str,
+        required=False,
+        dest="threads",
+    )
+
+    parser_migrate.set_defaults(func=spacemake_migrate)
+
+    return parser_migrate
+
 #####################################################
 # actual command-line functions, used as call-backs #
 #####################################################
@@ -1146,6 +1185,52 @@ def list_projects_cmdline(args):
     logger.info(df.loc[:, variables].__str__())
 
 
+@message_aggregation(logger_name)
+def spacemake_migrate(args):
+    """spacemake_migrate.
+
+    :param args:
+    """
+    from spacemake.migrate import convert_bam_to_cram
+    from spacemake.project_df import get_global_ProjectDF
+    import time
+    import yaml
+
+    project_id = args['project_id']
+    sample_id = args['sample_id']
+    threads = args['threads']
+
+    pdf = get_global_ProjectDF()
+
+    # Make sure that the project-id and sample-id combination provided exists
+    pdf.assert_sample(project_id, sample_id)
+    project_folder = os.path.join('projects', project_id, 'processed_data', sample_id, 'illumina', 'complete_data')
+
+    # Begin migration
+    print('Beginning migration ...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
+
+    if not os.path.exists(os.path.join(project_id, 'stats.csv')):
+        print(f"Stats file for sample with (project-id, sample-id)=({project_id}, {sample_id}) " 
+              "not found on disk. Will generate it now.")
+        # Execute code written elsewhere to generate the file
+    else:
+        print("Stats file found on disk")
+
+    if not os.path.exists(os.path.join(project_folder, 'final.cram')):
+        # TODO: fix this with a proper check.
+        print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
+              "not found on disk. Will generate them now.")
+        # Execute code to convert to CRAM
+        convert_bam_to_cram(project_id, sample_id, threads)
+    else:
+        print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
+              "already on disk.")
+
+    print("Removing unnecessary files ...", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
+
+    print("Migration complete ...", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
+
+
 def make_main_parser():
     #################
     # DEFINE PARSER #
@@ -1159,7 +1244,7 @@ def make_main_parser():
 
     parser_main = argparse.ArgumentParser(
         allow_abbrev=False,
-        description="spacemake: bioinformatic pipeline for processing and analysis of spatial-transcriptomics data",
+        description="Spacemake: processing and analysis of large-scale spatial transcriptomics data",
     )
 
     parser_main.add_argument("--version", action="store_true")
@@ -1172,6 +1257,7 @@ def make_main_parser():
     parser_projects = None
     parser_config = None
     parser_init = None
+    parser_migrate = None
     parser_spatial = None
 
     ##################
@@ -1194,14 +1280,19 @@ def make_main_parser():
         # SPACEMAKE PROJECT/SAMPLE #
         ############################
         from spacemake.cmdline import setup_project_parser
-
+ 
         parser_projects = setup_project_parser(parser_main_subparsers)
 
         #################
         # SPACEMAKE RUN #
         #################
         parser_run = setup_run_parser(parser_main_subparsers)
 
+        #####################
+        # SPACEMAKE MIGRATE #
+        #####################
+        parser_migrate = setup_migrate_parser(parser_main_subparsers)
+
         #####################
         # SPACEMAKE SPATIAL #
         #####################
@@ -1214,6 +1305,7 @@ def make_main_parser():
         "config": parser_config,
         "projects": parser_projects,
         "run": parser_run,
+        "migrate": parser_migrate,
         "main": parser_main,
         "spatial": parser_spatial,
     }

diff --git a/spacemake/migrate.py b/spacemake/migrate.py
@@ -0,0 +1,102 @@
+import os
+import subprocess
+import time
+import yaml
+
+from spacemake.project_df import get_global_ProjectDF
+from spacemake.util import sync_timestamps
+
+
+def find_bam_files(folder):
+    """
+    Finds all .bam files in the given folder and checks if any of them is a symlink.
+
+    Returns a list of tuples of type (str, bool), e.g. ('bam_file', False)
+    """
+    if not os.path.isdir(folder):
+        raise ValueError(f"The provided path {folder} is not a valid directory.")
+
+    # Find files and check for symlinks
+    bam_files = [f for f in os.listdir(folder) if f.endswith('.bam')]
+    bam_file_paths = [os.path.join(folder, f) for f in bam_files]
+    are_symlinks = [os.path.islink(bam_file_path) for bam_file_path in bam_file_paths]
+
+    return list(zip(bam_file_paths, are_symlinks))
+
+
+def get_map_strategy_sequences(project_id, sample_id):
+    """
+    Returns a dictionary of reference_types and their location, e.g. {rRNA : /path/to/disk/sequence.fa}
+    """
+    pdf = get_global_ProjectDF()
+
+    map_strategy = pdf.get_sample_info(project_id, sample_id)['map_strategy']
+    sequence_type = [mapping.split(':')[1] for mapping in map_strategy.split('->')]
+
+    with open("config.yaml") as yamlfile:
+        cf = yaml.safe_load(yamlfile.read())
+    sample_species = pdf.get_sample_info(project_id, sample_id)['species']
+
+    reference_type = {st : cf['species'][sample_species][st]['sequence'] for st in sequence_type}
+
+    return reference_type
+
+
+def convert_bam_to_cram(project_id, sample_id, threads=4):
+    """
+    Converts all BAM files to CRAM and updates the timestamps to those of the
+    original files. Symbolic links are treated as such.
+    """
+    species_sequences = get_map_strategy_sequences(project_id, sample_id)
+
+    project_folder = os.path.join('projects', project_id, 'processed_data',
+                                  sample_id, 'illumina', 'complete_data')    
+    bam_files = find_bam_files(project_folder)
+
+    for idx in range(len(bam_files)):
+        bam_filename, bam_file_is_symlink = bam_files[idx]
+        bam_filename_prefix = bam_filename.rsplit('.', 1)[0]
+        cram_filename = bam_filename_prefix + '.cram'
+
+        if os.path.exists(cram_filename):
+            print('CRAM file', cram_filename, 'already exists. Skipping conversion.')
+            continue
+
+        if 'unaligned' in bam_filename:
+            continue
+
+        if bam_file_is_symlink:
+            true_bam_filename = os.readlink(bam_filename)
+            true_bam_filename_prefix = true_bam_filename.rsplit('.', 1)[0]
+            os.symlink(true_bam_filename_prefix + '.cram', cram_filename)
+        else:
+            print('Converting', bam_filename, 'to', cram_filename, 
+            '...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
+
+            for ref_type in species_sequences:
+                if ref_type in bam_filename:
+                    ref_sequence = species_sequences[ref_type]
+                    break
+
+            subprocess.run(
+                [
+                    "samtools", "view",
+                    "-T", ref_sequence,
+                    "-C",
+                    "--threads", str(threads),
+                    "-o", cram_filename,
+                    bam_filename
+                ]
+            )
+
+        sync_timestamps(bam_filename, cram_filename)
+
+
+def remove_files(project_folder):
+    # - BAM files (only if CRAMs are present)
+    bam_files = find_bam_files(project_folder)
+
+    # - unaligned.bam
+
+    # remove tiles
+
diff --git a/spacemake/util.py b/spacemake/util.py
@@ -578,3 +578,29 @@ def load_config_with_fallbacks(args, try_yaml="config.yaml"):
     import argparse
 
     return argparse.Namespace(**args_kw)
+
+
+def sync_timestamps(original_file, new_file):
+    """
+    Sync the timestamps (access and modification time) of new_file with those of original_file.
+
+    Args:
+        original_file (str): Path to the file whose timestamps will be copied.
+        new_file (str): Path to the file that will have its timestamps updated.
+    """
+    try:
+        # Get the access time and modification time from original_file
+        if os.path.islink(original_file):
+            source_times = os.lstat(original_file)
+        else:
+            source_times = os.stat(original_file)
+
+        # Set the same access and modification time for new_file 
+        os.utime(new_file, (source_times.st_atime, source_times.st_mtime),
+                 follow_symlinks=not os.path.islink(original_file))
+
+        print(f"File timestamps of {new_file} set to match {original_file}.")
+    except FileNotFoundError:
+        print(f"Error: One or both of the files '{original_file}' or '{new_file}' do not exist.")
+    except Exception as e:
+        print(f"An error occurred: {e}")