refactored and added reference_type detection

nukappa · Dec 19, 2024 · fddf04f · fddf04f
1 parent e2eb4a2
commit fddf04f
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 21 deletions.
diff --git a/spacemake/cmdline.py b/spacemake/cmdline.py
@@ -1205,13 +1205,7 @@ def spacemake_migrate(args):
     # Make sure that the project-id and sample-id combination provided exists
     pdf.assert_sample(project_id, sample_id)
     project_folder = os.path.join('projects', project_id, 'processed_data', sample_id, 'illumina', 'complete_data')
-
-    # Extract vars from the config.yaml for later use
-    with open("config.yaml") as yamlfile:
-        cf = yaml.safe_load(yamlfile.read())
-    sample_species = pdf.get_sample_info(project_id, sample_id)['species']
-    genome_sequence = cf['species'][sample_species]['genome']['sequence']
-
+
     # Begin migration
     print('Beginning migration ...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
 
@@ -1227,7 +1221,7 @@ def spacemake_migrate(args):
         print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
               "not found on disk. Will generate them now.")
         # Execute code to convert to CRAM
-        convert_bam_to_cram(genome_sequence, project_folder, threads)
+        convert_bam_to_cram(project_id, sample_id, threads)
     else:
         print(f"CRAM files for sample with (project-id, sample-id)=({project_id}, {sample_id}) "
               "already on disk.")

diff --git a/spacemake/migrate.py b/spacemake/migrate.py
@@ -1,7 +1,9 @@
 import os
 import subprocess
 import time
+import yaml
 
+from spacemake.project_df import get_global_ProjectDF
 from spacemake.util import sync_timestamps
 
 
@@ -19,47 +21,78 @@ def find_bam_files(folder):
     bam_file_paths = [os.path.join(folder, f) for f in bam_files]
     are_symlinks = [os.path.islink(bam_file_path) for bam_file_path in bam_file_paths]
 
-    return list(zip(bam_files, are_symlinks))
+    return list(zip(bam_file_paths, are_symlinks))
 
 
-def convert_bam_to_cram(ref_sequence, project_folder, threads=4):
-    bam_files = find_bam_files(project_folder)
+def get_map_strategy_sequences(project_id, sample_id):
+    """
+    Returns a dictionary of reference_types and their location, e.g. {rRNA : /path/to/disk/sequence.fa}
+    """
+    pdf = get_global_ProjectDF()
+
+    map_strategy = pdf.get_sample_info(project_id, sample_id)['map_strategy']
+    sequence_type = [mapping.split(':')[1] for mapping in map_strategy.split('->')]
 
+    with open("config.yaml") as yamlfile:
+        cf = yaml.safe_load(yamlfile.read())
+    sample_species = pdf.get_sample_info(project_id, sample_id)['species']
+
+    reference_type = {st : cf['species'][sample_species][st]['sequence'] for st in sequence_type}
+
+    return reference_type
+
+
+def convert_bam_to_cram(project_id, sample_id, threads=4):
+    """
+    Converts all BAM files to CRAM and updates the timestamps to those of the
+    original files. Symbolic links are treated as such.
+    """
+    species_sequences = get_map_strategy_sequences(project_id, sample_id)
+
+    project_folder = os.path.join('projects', project_id, 'processed_data',
+                                  sample_id, 'illumina', 'complete_data')    
+    bam_files = find_bam_files(project_folder)
+
     for idx in range(len(bam_files)):
         bam_filename, bam_file_is_symlink = bam_files[idx]
         bam_filename_prefix = bam_filename.rsplit('.', 1)[0]
         cram_filename = bam_filename_prefix + '.cram'
 
-        if os.path.exists(os.path.join(project_folder, cram_filename)):
+        if os.path.exists(cram_filename):
             print('CRAM file', cram_filename, 'already exists. Skipping conversion.')
             continue
 
-        # TODO: change ref sequence for genome, rRNA, phiX, custom? Get it from map_strategy
+        if 'unaligned' in bam_filename:
+            continue
 
         if bam_file_is_symlink:
-            true_bam_filename = os.readlink(os.path.join(project_folder, bam_filename))
+            true_bam_filename = os.readlink(bam_filename)
             true_bam_filename_prefix = true_bam_filename.rsplit('.', 1)[0]
-            os.symlink(true_bam_filename_prefix + '.cram',
-                       os.path.join(project_folder, cram_filename))
+            os.symlink(true_bam_filename_prefix + '.cram', cram_filename)
         else:
             print('Converting', bam_filename, 'to', cram_filename, 
             '...', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
+
+            for ref_type in species_sequences:
+                if ref_type in bam_filename:
+                    ref_sequence = species_sequences[ref_type]
+                    break
+
             subprocess.run(
                 [
                     "samtools", "view",
                     "-T", ref_sequence,
                     "-C",
                     "--threads", str(threads),
-                    "-o", os.path.join(project_folder, cram_filename),
-                    os.path.join(project_folder, bam_filename)
+                    "-o", cram_filename,
+                    bam_filename
                 ]
             )
 
-        sync_timestamps(os.path.join(project_folder, bam_filename),
-                        os.path.join(project_folder, cram_filename))
+        sync_timestamps(bam_filename, cram_filename)
 
 
-def remove_files():
+def remove_files(project_folder):
     # - BAM files (only if CRAMs are present)
     bam_files = find_bam_files(project_folder)