Added progress to layout.py and ml.py.

Created shared variable to keep track of time and iterations, in order to be able to show progress, time spent and estimated time remaining. Added an error handling where figure.py whould crash when trying to calculate a regression when all values where identical (more likely to happen at the beggining of the chromosome). Added a check to recalculate the LMI when the user adds another signal succeding the compute of KK and LMI with less signals. This way, the user does not need to create a new working directory and compute everything again.
3DGenomes · Jul 13, 2023 · 9611d57 · 9611d57
1 parent b670e49
commit 9611d57
Show file tree

Hide file tree

Showing 8 changed files with 834 additions and 75 deletions.
diff --git a/LICENSE b/LICENSE
diff --git a/metaloci/VERSION b/metaloci/VERSION
@@ -1 +1 @@
-0.1.24
+0.2.3
diff --git a/metaloci/plot/plot.py b/metaloci/plot/plot.py
@@ -334,29 +334,37 @@ def get_lmi_scatterplot(
     x = zscore(lmi_geometry.signal)
     y = zscore(y_lag)
 
-    _, _, r_value_scat, p_value_scat, _ = linregress(x, y)
-    scatter, ax = plt.subplots(figsize=(5, 5))  # , subplot_kw={'aspect':'equal'})
+    try:
 
-    alpha_sp = [1.0 if val < signipval else 0.1 for val in lmi_geometry.LMI_pvalue]
-    colors_sp = [colors_lmi[val] for val in lmi_geometry.moran_quadrant]
+        _, _, r_value_scat, p_value_scat, _ = linregress(x, y)
+        scatter, ax = plt.subplots(figsize=(5, 5))  # , subplot_kw={'aspect':'equal'})
 
-    plt.scatter(x=x, y=y, s=100, ec="white", fc=colors_sp, alpha=alpha_sp)
+        alpha_sp = [1.0 if val < signipval else 0.1 for val in lmi_geometry.LMI_pvalue]
+        colors_sp = [colors_lmi[val] for val in lmi_geometry.moran_quadrant]
 
-    sns.scatterplot(
-        x=[x[mlobject.poi]], y=[y[mlobject.poi]], s=150, ec="lime", fc="none", zorder=len(lmi_geometry)
-    )
-    sns.regplot(x=x, y=y, scatter=False, color="k")
-    sns.despine(top=True, right=True, left=False, bottom=False, offset=10, trim=False)
+        plt.scatter(x=x, y=y, s=100, ec="white", fc=colors_sp, alpha=alpha_sp)
+
+        sns.scatterplot(
+            x=[x[mlobject.poi]], y=[y[mlobject.poi]], s=150, ec="lime", fc="none", zorder=len(lmi_geometry)
+        )
+        sns.regplot(x=x, y=y, scatter=False, color="k")
+        sns.despine(top=True, right=True, left=False, bottom=False, offset=10, trim=False)
+
+        plt.title(f"Moran Local Scatterplot\nr: {r_value_scat:4.2f}   p-value: {p_value_scat:.1e}")
+        plt.axvline(x=0, color="k", linestyle=":")
+        plt.axhline(y=0, color="k", linestyle=":")
+
+        ax.set_xlabel("Z-score(Signal)")
+        ax.set_ylabel("Z-score(Signal Spatial Lag)")
 
-    plt.title(f"Moran Local Scatterplot\nr: {r_value_scat:4.2f}   p-value: {p_value_scat:.1e}")
-    plt.axvline(x=0, color="k", linestyle=":")
-    plt.axhline(y=0, color="k", linestyle=":")
+        r_value_scat = float(r_value_scat)
+        p_value_scat = float(r_value_scat)
+
+    except ValueError:
 
-    ax.set_xlabel("Z-score(Signal)")
-    ax.set_ylabel("Z-score(Signal Spatial Lag)")
+        print("\t\tCannot compute lineal regression as all values are identical.")
 
-    r_value_scat = float(r_value_scat)
-    p_value_scat = float(r_value_scat)
+        return None, None, None
 
     return scatter, r_value_scat, p_value_scat
 

diff --git a/metaloci/tools/figure.py b/metaloci/tools/figure.py
@@ -166,10 +166,15 @@ def run(opts):
 
         except FileNotFoundError:
 
-            print("\n\t.mlo file not found for this region. \n\tSkipping to next region.")
+            print("\n\t.mlo file not found for this region. \n\tSkipping to the next one.")
 
             continue
 
+        if mlobject.lmi_info == None:
+
+            print("\n\tLMI not calculated for this region. \n\tSkipping to the next one...")
+            continue        
+
         buffer = mlobject.kk_distances.diagonal(1).mean() * INFLUENCE
 
         bins = []
@@ -246,14 +251,7 @@ def run(opts):
             plt.close()
             print("\t\tGaudi Type plot -> done.")
 
-            print("\t\tLMI Scatter plot", end="\r")
-            lmi_plt, r_value, p_value = plot.get_lmi_scatterplot(
-                mlobject, merged_lmi_geometry, buffer * BFACT, signipval, colors
-            )
-            lmi_plt.savefig(f"{plot_filename}_lmi.pdf", **plot_opt)
-            lmi_plt.savefig(f"{plot_filename}_lmi.png", **plot_opt)
-            plt.close()
-            print("\t\tLMI Scatter plot -> done.")
+
 
             print("\t\tSignal plot", end="\r")
             bed_data, selmetaloci = plot.signal_bed(
@@ -271,6 +269,31 @@ def run(opts):
             sig_plt.savefig(f"{plot_filename}_signal.png", **plot_opt)
             plt.close()
             print("\t\tSignal plot -> done.")
+
+            print("\t\tLMI Scatter plot", end="\r")
+            lmi_plt, r_value, p_value = plot.get_lmi_scatterplot(
+                mlobject, merged_lmi_geometry, buffer * BFACT, signipval, colors
+            )
+
+            if lmi_plt is not None:
+
+                lmi_plt.savefig(f"{plot_filename}_lmi.pdf", **plot_opt)
+                lmi_plt.savefig(f"{plot_filename}_lmi.png", **plot_opt)
+                plt.close()
+
+                print("\t\tLMI Scatter plot -> done.")
+
+            else: 
+
+                if signal == signals[-1]:
+
+                    print("\t\tSkipping to next region...")
+                    continue
+
+                else:
+
+                    print("\t\tSkipping to next signal...")
+                    continue
 
             print(f"\t\tFinal composite figure for region '{region}' and signal '{signal}'", end="\r")
             img1 = Image.open(f"{plot_filename}_lmi.png")

diff --git a/metaloci/tools/layout.py b/metaloci/tools/layout.py
@@ -7,6 +7,7 @@
 from collections import defaultdict
 from datetime import timedelta
 from time import time
+import subprocess as sp
 
 import cooler
 import hicstraw
@@ -116,9 +117,10 @@ def populate_args(parser):
         help="Set a persistence length for the Kamada-Kawai layout.",
     )
 
-def get_region_layout(row, opts, silent: bool = True):
+def get_region_layout(row, opts, progress=None, silent: bool = True):
 
     work_dir = opts.work_dir
+    regions = opts.regions
     hic_path = opts.hic_file
     resolution = opts.reso
     cutoffs = opts.cutoff
@@ -130,6 +132,14 @@ def get_region_layout(row, opts, silent: bool = True):
 
         cutoffs = [0.2] 
 
+    if os.path.isfile(regions):
+
+        df_regions = pd.read_table(regions)
+
+    else:
+
+        df_regions = pd.DataFrame({"coords": [regions], "symbol": ["symbol"], "id": ["id"]})
+
     cutoffs.sort(key=float, reverse=True)
 
     bad_regions = defaultdict(list) # does not belong here, this is to prevent an error TODO
@@ -161,6 +171,8 @@ def get_region_layout(row, opts, silent: bool = True):
 
             print(f"\n------> Region {region_coords} already done (no data).")
 
+        if progress is not None: progress["done"] = True
+
         return
 
     else:
@@ -169,13 +181,15 @@ def get_region_layout(row, opts, silent: bool = True):
 
             print(f"\n------> Region {region_coords} already done.")
 
+        if progress is not None: progress["done"] = True
+
         if force:
 
             if silent == False:
 
                 print(
                     "\tForce option (-f) selected, recalculating "
-                    "the Kamada-Kawai layout (files will be overwritten)\n"
+                    "the Kamada-Kawai layout (files will be overwritten)"
                 )
 
             mlobject = mlo.MetalociObject(
@@ -213,9 +227,9 @@ def get_region_layout(row, opts, silent: bool = True):
 
         return
 
-    for cutoff in cutoffs:
+    time_per_region = time()
 
-        time_per_kk = time()
+    for cutoff in cutoffs:
 
         mlobject.kk_cutoff = cutoff
 
@@ -270,14 +284,14 @@ def get_region_layout(row, opts, silent: bool = True):
 
         if silent == False:
 
-            print(f"\tdone in {timedelta(seconds=round(time() - time_per_kk))}.\n")
+            print(f"\tdone in {timedelta(seconds=round(time() - time_per_region))}.\n")
 
         if len(cutoffs) == 1:
 
             if silent == False:
                 print(
-                    f"\tKamada-Kawai layout of region {mlobject.region} saved"
-                    f" at {int(cutoff * 100)} % cutoff to file: {mlobject.save_path}"
+                    f"\tKamada-Kawai layout of region {mlobject.region} "
+                    f"at {int(cutoff * 100)} % cutoff saved to file: {mlobject.save_path}"
                 )
 
             # Save mlobject.
@@ -318,6 +332,18 @@ def get_region_layout(row, opts, silent: bool = True):
         #             # if word not in file then write the word to the file
         #             handler.write(f"{region}\t{reason[0]}\n")
         #             handler.flush()
+        # print(int(timedelta(seconds=round(time() - time_per_region))))
+
+    if progress is not None:
+
+        progress['value'] += 1
+
+        time_spent = time() - progress['timer']
+        time_remaining = int(time_spent / progress['value'] * (len(df_regions) - progress['value']))
+
+        print(f"\033[A{'  '*int(sp.Popen(['tput','cols'], stdout=sp.PIPE).communicate()[0].strip())}\033[A")
+        print(f"\t[{progress['value']}/{len(df_regions)}] | Time spent: {timedelta(seconds=round(time_spent))} | "
+                f"ETR: {timedelta(seconds=round(time_remaining))}", end='\r')
 
 
 def run(opts):
@@ -359,13 +385,21 @@ def run(opts):
 
     if multiprocess:
 
-        print(f"{len(df_regions)} regions will be computed.\n")
+        print(f"\n------> {len(df_regions)} regions will be computed.\n")
 
         try:
 
+            manager = mp.Manager()
+            progress = manager.dict(value=0, timer = start_timer, done = False)
+
             with mp.Pool(processes=cores) as pool:
 
-                pool.starmap(get_region_layout, [(row, opts) for _, row in df_regions.iterrows()])
+                pool.starmap(get_region_layout, [(row, opts, progress) for _, row in df_regions.iterrows()])
+
+                if progress["done"] == True:
+
+                    print("\tSome regions had already been computed and have been skipped.", end="")
+
                 pool.close()
                 pool.join()
 
@@ -393,5 +427,5 @@ def run(opts):
     #             if not any(f"{row.region}\t{row.reason}" in line for line in handler)
     #         ]
 
-    print(f"\nTotal time spent: {timedelta(seconds=round(time() - start_timer))}.")
+    print(f"\n\nTotal time spent: {timedelta(seconds=round(time() - start_timer))}.")
     print("\nall done.")