From b3811c67e4981542bc965a5e3b46ba59ae7aecc1 Mon Sep 17 00:00:00 2001
From: nukappa <nukappa@users.noreply.github.com>
Date: Tue, 17 Dec 2024 17:37:29 +0100
Subject: [PATCH 1/6] basic parser for migrate utility. checks for project_id,
 sample_id combo

---
 spacemake/cmdline.py | 61 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/spacemake/cmdline.py b/spacemake/cmdline.py
index bd4e3c0..1897b38 100644
--- a/spacemake/cmdline.py
+++ b/spacemake/cmdline.py
@@ -774,6 +774,37 @@ def setup_run_parser(parent_parser_subparsers):
     return parser_run
 
 
+def setup_migrate_parser(parent_parser_subparsers):
+    """setup_migrate_parser
+
+    :param parent_parser_subparsers
+    """
+    parser_migrate = parent_parser_subparsers.add_parser(
+        "migrate",
+        help="migrate spacemake"
+    )
+
+    parser_migrate.add_argument(
+        "--project-id",
+        default="",
+        help="The project-id of the sample to perform the migration.",
+        type=str,
+        required=True,
+        dest="project_id",
+    )
+    parser_migrate.add_argument(
+        "--sample-id",
+        default="",
+        help="The sample-id of the sample to perform the migration.",
+        type=str,
+        required=True,
+        dest="sample_id",
+    )
+
+    parser_migrate.set_defaults(func=spacemake_migrate)
+
+    return parser_migrate
+
 #####################################################
 # actual command-line functions, used as call-backs #
 #####################################################
@@ -1145,6 +1176,25 @@ def list_projects_cmdline(args):
     # print the table
     logger.info(df.loc[:, variables].__str__())
 
+@message_aggregation(logger_name)
+def spacemake_migrate(args):
+    """spacemake_migrate.
+
+    :param args:
+    """
+    from spacemake.project_df import get_global_ProjectDF
+
+    pdf = get_global_ProjectDF()
+    
+    # Check that the project-id and sample-id combination provided exists
+    pdf.assert_sample(args['project_id'], args['sample_id'])
+
+    # TODO: convert BAM to CRAM, appropriately change timestamp
+
+    # TODO: delete BAMs
+
+    # TODO: delete other unnecessary files
+
 
 def make_main_parser():
     #################
@@ -1159,7 +1209,7 @@ def make_main_parser():
 
     parser_main = argparse.ArgumentParser(
         allow_abbrev=False,
-        description="spacemake: bioinformatic pipeline for processing and analysis of spatial-transcriptomics data",
+        description="Spacemake: processing and analysis of large-scale spatial transcriptomics data",
     )
 
     parser_main.add_argument("--version", action="store_true")
@@ -1172,6 +1222,7 @@ def make_main_parser():
     parser_projects = None
     parser_config = None
     parser_init = None
+    parser_migrate = None
     parser_spatial = None
 
     ##################
@@ -1194,7 +1245,7 @@ def make_main_parser():
         # SPACEMAKE PROJECT/SAMPLE #
         ############################
         from spacemake.cmdline import setup_project_parser
-
+ 
         parser_projects = setup_project_parser(parser_main_subparsers)
 
         #################
@@ -1202,6 +1253,11 @@ def make_main_parser():
         #################
         parser_run = setup_run_parser(parser_main_subparsers)
 
+        #####################
+        # SPACEMAKE MIGRATE #
+        #####################
+        parser_migrate = setup_migrate_parser(parser_main_subparsers)
+
         #####################
         # SPACEMAKE SPATIAL #
         #####################
@@ -1214,6 +1270,7 @@ def make_main_parser():
         "config": parser_config,
         "projects": parser_projects,
         "run": parser_run,
+        "migrate": parser_migrate,
         "main": parser_main,
         "spatial": parser_spatial,
     }

From a8835ea5b68862b3ac2210498b77a4a320f193c4 Mon Sep 17 00:00:00 2001
From: nukappa <nukappa@users.noreply.github.com>
Date: Wed, 18 Dec 2024 14:21:05 +0100
Subject: [PATCH 2/6] added CRAM conversion, several TODOs pending

---
 spacemake/cmdline.py | 70 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 8 deletions(-)

diff --git a/spacemake/cmdline.py b/spacemake/cmdline.py
index 1897b38..b39b6e9 100644
--- a/spacemake/cmdline.py
+++ b/spacemake/cmdline.py
@@ -787,7 +787,7 @@ def setup_migrate_parser(parent_parser_subparsers):
     parser_migrate.add_argument(
         "--project-id",
         default="",
-        help="The project-id of the sample to perform the migration.",
+        help="The project-id of the sample to perform the migration",
         type=str,
         required=True,
         dest="project_id",
@@ -795,11 +795,19 @@ def setup_migrate_parser(parent_parser_subparsers):
     parser_migrate.add_argument(
         "--sample-id",
         default="",
-        help="The sample-id of the sample to perform the migration.",
+        help="The sample-id of the sample to perform the migration",
         type=str,
         required=True,
         dest="sample_id",
     )
+    parser_migrate.add_argument(
+        "--threads",
+        default=1,
+        help="Number of threads to use",
+        type=int,  # reject non-numeric values at parse time; consumers call str(threads)
+        required=False,
+        dest="threads",
+    )
 
     parser_migrate.set_defaults(func=spacemake_migrate)
 
@@ -1176,6 +1184,7 @@ def list_projects_cmdline(args):
     # print the table
     logger.info(df.loc[:, variables].__str__())
 
+
 @message_aggregation(logger_name)
 def spacemake_migrate(args):
     """spacemake_migrate.
@@ -1183,17 +1192,62 @@ def spacemake_migrate(args):
     :param args:
     """
     from spacemake.project_df import get_global_ProjectDF
+    import subprocess
+    import time
+    import yaml
+
+    project_id = args['project_id']
+    sample_id = args['sample_id']
+    threads = args['threads']
 
     pdf = get_global_ProjectDF()
-    
-    # Check that the project-id and sample-id combination provided exists
-    pdf.assert_sample(args['project_id'], args['sample_id'])
 
-    # TODO: convert BAM to CRAM, appropriately change timestamp
+    # Make sure that the project-id and sample-id combination provided exists
+    pdf.assert_sample(project_id, sample_id)
+    project_folder = os.path.join('projects', project_id, 'processed_data', sample_id, 'illumina', 'complete_data')
+
+    # Extract vars from the config.yaml for later use
+    with open("config.yaml") as yamlfile:
+        cf = yaml.safe_load(yamlfile.read())
+    sample_species = pdf.get_sample_info(project_id, sample_id)['species']
+    genome_sequence = cf['species'][sample_species]['genome']['sequence']
+
+    # Start migrartion
+    print('Beginning migration ...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
+    if not os.path.exists(os.path.join(project_id, 'stats.csv')):
+        print(f"Stats file for sample with (project_id, sample_id)=({project_id}, {sample_id}) " 
+              "not found on disk. Will generate it now.")
+        # Execute code written elsewhere to generate the file
+    else:
+        print("Stats file found on disk")
+
+    if not os.path.exists(os.path.join(project_folder, 'final.cram')):
+        print(f"CRAM files for sample with (project_id, sample_id)=({project_id}, {sample_id}) "
+              "not found on disk. Will generate them now.")
+        # Execute code to convert to CRAM)
+        # TODO: reference BAM from internals OR write a func to find it
+        # TODO: proper naming for CRAM
+        # TODO: transfer timestamp
+        subprocess.run(
+            [
+                "samtools", "view",
+                "-T", genome_sequence,
+                "-C",
+                "--threads", str(threads),
+                "-o", os.path.join(project_folder, "final.cram"),
+                os.path.join(project_folder, "final.polyA_adapter_trimmed.bam")
+            ]
+        )
+    else:
+        print(f"CRAM files for sample with (project_id, sample_id)=({project_id}, {sample_id}) "
+              "already on disk. Skipping conversion step.")
 
-    # TODO: delete BAMs
+    print("Removing unnecessary files ...", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
+    # to delete:
+    # - BAM files (if CRAM are present)
+    # - unaligned.bam
 
-    # TODO: delete other unnecessary files
+    print("Migration complete ...", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
 
 
 def make_main_parser():

From c338f3a47153472c3cfa179a7a6a4a8453973c02 Mon Sep 17 00:00:00 2001
From: nukappa <nukappa@users.noreply.github.com>
Date: Thu, 19 Dec 2024 09:19:46 +0100
Subject: [PATCH 3/6] moved functions to a migration dedicated file

---
 spacemake/cmdline.py | 32 +++++++-----------------
 spacemake/migrate.py | 59 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 23 deletions(-)
 create mode 100644 spacemake/migrate.py

diff --git a/spacemake/cmdline.py b/spacemake/cmdline.py
index b39b6e9..c995b0f 100644
--- a/spacemake/cmdline.py
+++ b/spacemake/cmdline.py
@@ -1191,8 +1191,8 @@ def spacemake_migrate(args):
 
     :param args:
     """
+    from spacemake.migrate import convert_bam_to_cram
     from spacemake.project_df import get_global_ProjectDF
-    import subprocess
     import time
     import yaml
 
@@ -1212,40 +1212,26 @@ def spacemake_migrate(args):
     sample_species = pdf.get_sample_info(project_id, sample_id)['species']
     genome_sequence = cf['species'][sample_species]['genome']['sequence']
 
-    # Start migrartion
+    # Begin migration
     print('Beginning migration ...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
+    
     if not os.path.exists(os.path.join(project_id, 'stats.csv')):
-        print(f"Stats file for sample with (project_id, sample_id)=({project_id}, {sample_id}) " 
+        print(f"Stats file for sample with (project-id, sample-id)=({project_id}, {sample_id}) " 
               "not found on disk. Will generate it now.")
         # Execute code written elsewhere to generate the file
     else:
         print("Stats file found on disk")
 
     if not os.path.exists(os.path.join(project_folder, 'final.cram')):
-        print(f"CRAM files for sample with (project_id, sample_id)=({project_id}, {sample_id}) "
+        print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
               "not found on disk. Will generate them now.")
-        # Execute code to convert to CRAM)
-        # TODO: reference BAM from internals OR write a func to find it
-        # TODO: proper naming for CRAM
-        # TODO: transfer timestamp
-        subprocess.run(
-            [
-                "samtools", "view",
-                "-T", genome_sequence,
-                "-C",
-                "--threads", str(threads),
-                "-o", os.path.join(project_folder, "final.cram"),
-                os.path.join(project_folder, "final.polyA_adapter_trimmed.bam")
-            ]
-        )
+        # Execute code to convert to CRAM
+        convert_bam_to_cram(genome_sequence, project_folder, threads)
     else:
-        print(f"CRAM files for sample with (project_id, sample_id)=({project_id}, {sample_id}) "
-              "already on disk. Skipping conversion step.")
+        print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
+              "already on disk.")
 
     print("Removing unnecessary files ...", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
-    # to delete:
-    # - BAM files (if CRAM are present)
-    # - unaligned.bam
 
     print("Migration complete ...", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
 
diff --git a/spacemake/migrate.py b/spacemake/migrate.py
new file mode 100644
index 0000000..54b028a
--- /dev/null
+++ b/spacemake/migrate.py
@@ -0,0 +1,59 @@
+import os
+import subprocess
+import time
+
+def find_bam_files(folder):
+    """
+    Finds all .bam files in the given folder and checks if any of them is a symlink.
+    
+    Returns a list of tuples of type (str, bool), e.g. ('bam_file', False)
+    """
+    if not os.path.isdir(folder):
+        raise ValueError(f"The provided path {folder} is not a valid directory.")
+    
+    # Find files and check for symlinks
+    bam_files = [f for f in os.listdir(folder) if f.endswith('.bam')]
+    are_symlinks = [os.path.islink(bam_file) for bam_file in bam_files]
+
+    return list(zip(bam_files, are_symlinks))
+
+def convert_bam_to_cram(ref_sequence, project_folder, threads=4):
+    bam_files = find_bam_files(project_folder)
+
+    for idx in range(len(bam_files)):
+        bam_filename, bam_file_is_symlink = bam_files[idx]
+        bam_filename_prefix = bam_filename.rsplit('.', 1)[0]
+        cram_filename = bam_filename_prefix + ".cram"
+
+        # TODO: change ref sequence for genome, rRNA, phiX, custom?
+        
+        if bam_file_is_symlink:
+            # TODO: deal with this
+            continue
+        else:
+            print('Converting', bam_filename, 'to', cram_filename, 
+            '...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
+            subprocess.run(
+                [
+                    "samtools", "view",
+                    "-T", ref_sequence,
+                    "-C",
+                    "--threads", str(threads),
+                    "-o", os.path.join(project_folder, cram_filename),
+                    os.path.join(project_folder, bam_filename)
+                ]
+            )
+
+    # TODO: transfer timestamp
+
+    return
+
+def remove_files():
+    # - BAM files (if CRAMs are present)
+
+    # - unaligned.bam
+
+    # remove tiles
+
+    return
+

From 71dbe6a2333a0db662ca618a5e4393b9f541daa2 Mon Sep 17 00:00:00 2001
From: nukappa <nukappa@users.noreply.github.com>
Date: Thu, 19 Dec 2024 11:24:50 +0100
Subject: [PATCH 4/6] added transfer timestamp

---
 spacemake/migrate.py | 49 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/spacemake/migrate.py b/spacemake/migrate.py
index 54b028a..1f56930 100644
--- a/spacemake/migrate.py
+++ b/spacemake/migrate.py
@@ -2,6 +2,7 @@
 import subprocess
 import time
 
+
 def find_bam_files(folder):
     """
     Finds all .bam files in the given folder and checks if any of them is a symlink.
@@ -13,23 +14,54 @@ def find_bam_files(folder):
     
     # Find files and check for symlinks
     bam_files = [f for f in os.listdir(folder) if f.endswith('.bam')]
-    are_symlinks = [os.path.islink(bam_file) for bam_file in bam_files]
+    bam_file_paths = [os.path.join(folder, f) for f in bam_files]
+    are_symlinks = [os.path.islink(bam_file_path) for bam_file_path in bam_file_paths]
 
     return list(zip(bam_files, are_symlinks))
 
+
+def sync_timestamps(original_file, new_file):
+    """
+    Sync the timestamps (access and modification time) of new_file with those of original_file.
+    
+    Args:
+        original_file (str): Path to the file whose timestamps will be copied.
+        new_file (str): Path to the file that will have its timestamps updated.
+    """
+    try:
+        # Get the access time and modification time from original_file
+        source_times = os.stat(original_file)
+
+        # Set the same access and modification time for new_file
+        os.utime(new_file, (source_times.st_atime, source_times.st_mtime))
+
+        print(f"File timestamps of {new_file} set to match {original_file}.")
+    except FileNotFoundError:
+        print(f"Error: One or both of the files '{original_file}' or '{new_file}' do not exist.")
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+
 def convert_bam_to_cram(ref_sequence, project_folder, threads=4):
     bam_files = find_bam_files(project_folder)
 
     for idx in range(len(bam_files)):
         bam_filename, bam_file_is_symlink = bam_files[idx]
         bam_filename_prefix = bam_filename.rsplit('.', 1)[0]
-        cram_filename = bam_filename_prefix + ".cram"
+        cram_filename = bam_filename_prefix + '.cram'
+
+        if os.path.exists(os.path.join(project_folder, cram_filename)):
+            print('CRAM file', cram_filename, 'already exists. Skipping conversion.')
+            continue
 
         # TODO: change ref sequence for genome, rRNA, phiX, custom?
         
         if bam_file_is_symlink:
-            # TODO: deal with this
-            continue
+            # TODO: fix timestamp for symlink 
+            true_bam_filename = os.readlink(os.path.join(project_folder, bam_filename))
+            true_bam_filename_prefix = true_bam_filename.rsplit('.', 1)[0]
+            os.symlink(true_bam_filename_prefix + '.cram',
+                       os.path.join(project_folder, bam_filename_prefix + '.cram'))
         else:
             print('Converting', bam_filename, 'to', cram_filename, 
             '...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
@@ -44,16 +76,15 @@ def convert_bam_to_cram(ref_sequence, project_folder, threads=4):
                 ]
             )
 
-    # TODO: transfer timestamp
+        sync_timestamps(os.path.join(project_folder, bam_filename),
+                        os.path.join(project_folder, cram_filename))
 
-    return
 
 def remove_files():
-    # - BAM files (if CRAMs are present)
+    # - BAM files (only if CRAMs are present)
+    bam_files = find_bam_files(project_folder)
 
     # - unaligned.bam
 
     # remove tiles
 
-    return
-

From e2eb4a20cae95314801e8c1ae18d52908649d1a2 Mon Sep 17 00:00:00 2001
From: nukappa <nukappa@users.noreply.github.com>
Date: Thu, 19 Dec 2024 14:25:56 +0100
Subject: [PATCH 5/6] moved sync_timestamps to util and fixed symlink bug

---
 spacemake/cmdline.py |  1 +
 spacemake/migrate.py | 29 ++++-------------------------
 spacemake/util.py    | 26 ++++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/spacemake/cmdline.py b/spacemake/cmdline.py
index c995b0f..4b81184 100644
--- a/spacemake/cmdline.py
+++ b/spacemake/cmdline.py
@@ -1223,6 +1223,7 @@ def spacemake_migrate(args):
         print("Stats file found on disk")
 
     if not os.path.exists(os.path.join(project_folder, 'final.cram')):
+        # TODO: fix this with a proper check.
         print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
               "not found on disk. Will generate them now.")
         # Execute code to convert to CRAM
diff --git a/spacemake/migrate.py b/spacemake/migrate.py
index 1f56930..84d3c93 100644
--- a/spacemake/migrate.py
+++ b/spacemake/migrate.py
@@ -2,6 +2,8 @@
 import subprocess
 import time
 
+from spacemake.util import sync_timestamps
+
 
 def find_bam_files(folder):
     """
@@ -20,28 +22,6 @@ def find_bam_files(folder):
     return list(zip(bam_files, are_symlinks))
 
 
-def sync_timestamps(original_file, new_file):
-    """
-    Sync the timestamps (access and modification time) of new_file with those of original_file.
-    
-    Args:
-        original_file (str): Path to the file whose timestamps will be copied.
-        new_file (str): Path to the file that will have its timestamps updated.
-    """
-    try:
-        # Get the access time and modification time from original_file
-        source_times = os.stat(original_file)
-
-        # Set the same access and modification time for new_file
-        os.utime(new_file, (source_times.st_atime, source_times.st_mtime))
-
-        print(f"File timestamps of {new_file} set to match {original_file}.")
-    except FileNotFoundError:
-        print(f"Error: One or both of the files '{original_file}' or '{new_file}' do not exist.")
-    except Exception as e:
-        print(f"An error occurred: {e}")
-
-
 def convert_bam_to_cram(ref_sequence, project_folder, threads=4):
     bam_files = find_bam_files(project_folder)
 
@@ -54,14 +34,13 @@ def convert_bam_to_cram(ref_sequence, project_folder, threads=4):
             print('CRAM file', cram_filename, 'already exists. Skipping conversion.')
             continue
 
-        # TODO: change ref sequence for genome, rRNA, phiX, custom?
+        # TODO: change ref sequence for genome, rRNA, phiX, custom? Get it from map_strategy
         
         if bam_file_is_symlink:
-            # TODO: fix timestamp for symlink 
             true_bam_filename = os.readlink(os.path.join(project_folder, bam_filename))
             true_bam_filename_prefix = true_bam_filename.rsplit('.', 1)[0]
             os.symlink(true_bam_filename_prefix + '.cram',
-                       os.path.join(project_folder, bam_filename_prefix + '.cram'))
+                       os.path.join(project_folder, cram_filename))
         else:
             print('Converting', bam_filename, 'to', cram_filename, 
             '...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
diff --git a/spacemake/util.py b/spacemake/util.py
index 7fdedf4..3bab109 100644
--- a/spacemake/util.py
+++ b/spacemake/util.py
@@ -578,3 +578,29 @@ def load_config_with_fallbacks(args, try_yaml="config.yaml"):
     import argparse
 
     return argparse.Namespace(**args_kw)
+
+
+def sync_timestamps(original_file, new_file):
+    """
+    Sync the timestamps (access and modification time) of new_file with those of original_file.
+
+    Args:
+        original_file (str): Path to the file whose timestamps will be copied.
+        new_file (str): Path to the file that will have its timestamps updated.
+    """
+    try:
+        # Read the times from original_file; for a symlink use the link's own times (lstat)
+        if os.path.islink(original_file):
+            source_times = os.lstat(original_file)
+        else:
+            source_times = os.stat(original_file)
+
+        # Apply them to new_file; when original_file is a symlink, do not follow new_file
+        os.utime(new_file, (source_times.st_atime, source_times.st_mtime),
+                 follow_symlinks=not os.path.islink(original_file))
+
+        print(f"File timestamps of {new_file} set to match {original_file}.")
+    except FileNotFoundError:  # reported, not re-raised
+        print(f"Error: One or both of the files '{original_file}' or '{new_file}' do not exist.")
+    except Exception as e:  # any other failure is printed too; nothing is raised to the caller
+        print(f"An error occurred: {e}")

From fddf04f65b64995f994fa0a815681b39f4417578 Mon Sep 17 00:00:00 2001
From: nukappa <nukappa@users.noreply.github.com>
Date: Thu, 19 Dec 2024 17:37:41 +0100
Subject: [PATCH 6/6] refactored and added reference_type detection

---
 spacemake/cmdline.py | 10 ++------
 spacemake/migrate.py | 59 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/spacemake/cmdline.py b/spacemake/cmdline.py
index 4b81184..88e9ce9 100644
--- a/spacemake/cmdline.py
+++ b/spacemake/cmdline.py
@@ -1205,13 +1205,7 @@ def spacemake_migrate(args):
     # Make sure that the project-id and sample-id combination provided exists
     pdf.assert_sample(project_id, sample_id)
     project_folder = os.path.join('projects', project_id, 'processed_data', sample_id, 'illumina', 'complete_data')
-
-    # Extract vars from the config.yaml for later use
-    with open("config.yaml") as yamlfile:
-        cf = yaml.safe_load(yamlfile.read())
-    sample_species = pdf.get_sample_info(project_id, sample_id)['species']
-    genome_sequence = cf['species'][sample_species]['genome']['sequence']
-
+    
     # Begin migration
     print('Beginning migration ...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
     
@@ -1227,7 +1221,7 @@ def spacemake_migrate(args):
         print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
               "not found on disk. Will generate them now.")
         # Execute code to convert to CRAM
-        convert_bam_to_cram(genome_sequence, project_folder, threads)
+        convert_bam_to_cram(project_id, sample_id, threads)
     else:
         print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
               "already on disk.")
diff --git a/spacemake/migrate.py b/spacemake/migrate.py
index 84d3c93..3f0ced4 100644
--- a/spacemake/migrate.py
+++ b/spacemake/migrate.py
@@ -1,7 +1,9 @@
 import os
 import subprocess
 import time
+import yaml
 
+from spacemake.project_df import get_global_ProjectDF
 from spacemake.util import sync_timestamps
 
 
@@ -19,47 +21,78 @@ def find_bam_files(folder):
     bam_file_paths = [os.path.join(folder, f) for f in bam_files]
     are_symlinks = [os.path.islink(bam_file_path) for bam_file_path in bam_file_paths]
 
-    return list(zip(bam_files, are_symlinks))
+    return list(zip(bam_file_paths, are_symlinks))
 
 
-def convert_bam_to_cram(ref_sequence, project_folder, threads=4):
-    bam_files = find_bam_files(project_folder)
+def get_map_strategy_sequences(project_id, sample_id):
+    """
+    Returns a dictionary of reference_types and their location, e.g. {rRNA : /path/to/disk/sequence.fa}
+    """
+    pdf = get_global_ProjectDF()
+
+    map_strategy = pdf.get_sample_info(project_id, sample_id)['map_strategy']
+    sequence_type = [mapping.split(':')[1] for mapping in map_strategy.split('->')]
 
+    with open("config.yaml") as yamlfile:
+        cf = yaml.safe_load(yamlfile.read())
+    sample_species = pdf.get_sample_info(project_id, sample_id)['species']
+
+    reference_type = {st : cf['species'][sample_species][st]['sequence'] for st in sequence_type}
+
+    return reference_type
+     
+
+def convert_bam_to_cram(project_id, sample_id, threads=4):
+    """
+    Converts all BAM files to CRAM and updates the timestamps to those of the
+    original files. Symbolic links are treated as such.
+    """
+    species_sequences = get_map_strategy_sequences(project_id, sample_id)
+
+    project_folder = os.path.join('projects', project_id, 'processed_data',
+                                  sample_id, 'illumina', 'complete_data')    
+    bam_files = find_bam_files(project_folder)
+    
     for idx in range(len(bam_files)):
         bam_filename, bam_file_is_symlink = bam_files[idx]
         bam_filename_prefix = bam_filename.rsplit('.', 1)[0]
         cram_filename = bam_filename_prefix + '.cram'
 
-        if os.path.exists(os.path.join(project_folder, cram_filename)):
+        if os.path.lexists(cram_filename):  # lexists also sees a dangling CRAM symlink, which os.symlink below would trip over
             print('CRAM file', cram_filename, 'already exists. Skipping conversion.')
             continue
 
-        # TODO: change ref sequence for genome, rRNA, phiX, custom? Get it from map_strategy
+        if 'unaligned' in bam_filename:
+            continue
         
         if bam_file_is_symlink:
-            true_bam_filename = os.readlink(os.path.join(project_folder, bam_filename))
+            true_bam_filename = os.readlink(bam_filename)
             true_bam_filename_prefix = true_bam_filename.rsplit('.', 1)[0]
-            os.symlink(true_bam_filename_prefix + '.cram',
-                       os.path.join(project_folder, cram_filename))
+            os.symlink(true_bam_filename_prefix + '.cram', cram_filename)
         else:
             print('Converting', bam_filename, 'to', cram_filename, 
             '...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
+
+            matches = [seq for ref, seq in species_sequences.items() if ref in bam_filename]
+            if not matches:  # no match: never reuse the previous file's (or an unbound) reference
+                print('No reference sequence found for', bam_filename, '- skipping conversion.')
+                continue
+            ref_sequence = matches[0]
             subprocess.run(
                 [
                     "samtools", "view",
                     "-T", ref_sequence,
                     "-C",
                     "--threads", str(threads),
-                    "-o", os.path.join(project_folder, cram_filename),
-                    os.path.join(project_folder, bam_filename)
+                    "-o", cram_filename,
+                    bam_filename
                 ]
             )
 
-        sync_timestamps(os.path.join(project_folder, bam_filename),
-                        os.path.join(project_folder, cram_filename))
+        sync_timestamps(bam_filename, cram_filename)
 
 
-def remove_files():
+def remove_files(project_folder):
     # - BAM files (only if CRAMs are present)
     bam_files = find_bam_files(project_folder)